ACPICA: All acpica: Update copyrights to 2020 Including tool signons.

ACPICA commit 8b9c69d0984067051ffbe8526f871448ead6a26b
Link: https://github.com/acpica/acpica/commit/8b9c69d0
Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Erik Kaneda <erik.kaneda@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
author: Bob Moore <robert.moore@intel.com> 2020-01-10 11:31:49 -0800
committer: Rafael J. Wysocki <rafael.j.wysocki@intel.com> 2020-01-13 11:52:48 +0100
commit: 800ba7c5eaaa734e4bd66bf0441fc200bbcdca54 (patch)
tree: 5754a1e050b45d9e3be2f91f713b0687c61b0c4d /tools/perf/scripts/python/bin/export-to-postgresql-report
parent: fbdd256fe701a680f6eab3fa93dbab1942ab6a9f (diff)
0 files changed, 0 insertions, 0 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index eed551d8555f..633da5e37299 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -6,6 +6,7 @@
 
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/fs_struct.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 #include <linux/slab.h>
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index de009a33e0e2..f84412290a30 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -131,10 +131,9 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any)
 			}
 		}
 		spin_unlock(&dentry->d_lock);
-	} else {
-		if (dentry->d_inode)
-			ret = v9fs_fid_find_inode(dentry->d_inode, false, uid, any);
 	}
+	if (!ret && dentry->d_inode)
+		ret = v9fs_fid_find_inode(dentry->d_inode, false, uid, any);
 
 	return ret;
 }
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 281a1ed03a04..057487efaaeb 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -13,7 +13,8 @@
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/cred.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
+#include <linux/fs_context.h>
 #include <linux/slab.h>
 #include <linux/seq_file.h>
 #include <net/9p/9p.h>
@@ -33,6 +34,10 @@ struct kmem_cache *v9fs_inode_cache;
  */
 
 enum {
+	/* Mount-point source: we need to handle this explicitly because
+	 * the code below accepts unknown args (returning 0), and the vfs layer
+	 * only handles source itself if we rejected it with -ENOPARAM */
+	Opt_source,
 	/* Options that take integer arguments */
 	Opt_debug, Opt_dfltuid, Opt_dfltgid, Opt_afid,
 	/* String options */
@@ -43,27 +48,71 @@ enum {
 	Opt_access, Opt_posixacl,
 	/* Lock timeout option */
 	Opt_locktimeout,
-	/* Error token */
-	Opt_err
+
+	/* Client options */
+	Opt_msize, Opt_trans, Opt_legacy, Opt_version,
+
+	/* fd transport options */
+	/* Options that take integer arguments */
+	Opt_rfdno, Opt_wfdno,
+	/* Options that take no arguments */
+
+	/* rdma transport options */
+	/* Options that take integer arguments */
+	Opt_rq_depth, Opt_sq_depth, Opt_timeout,
+
+	/* Options for both fd and rdma transports */
+	Opt_port, Opt_privport,
 };
 
-static const match_table_t tokens = {
-	{Opt_debug, "debug=%x"},
-	{Opt_dfltuid, "dfltuid=%u"},
-	{Opt_dfltgid, "dfltgid=%u"},
-	{Opt_afid, "afid=%u"},
-	{Opt_uname, "uname=%s"},
-	{Opt_remotename, "aname=%s"},
-	{Opt_nodevmap, "nodevmap"},
-	{Opt_noxattr, "noxattr"},
-	{Opt_directio, "directio"},
-	{Opt_ignoreqv, "ignoreqv"},
-	{Opt_cache, "cache=%s"},
-	{Opt_cachetag, "cachetag=%s"},
-	{Opt_access, "access=%s"},
-	{Opt_posixacl, "posixacl"},
-	{Opt_locktimeout, "locktimeout=%u"},
-	{Opt_err, NULL}
+static const struct constant_table p9_versions[] = {
+	{ "9p2000",	p9_proto_legacy },
+	{ "9p2000.u",	p9_proto_2000u },
+	{ "9p2000.L",	p9_proto_2000L },
+	{}
+};
+
+/*
+ * This structure contains all parameters used for the core code,
+ * the client, and all the transports.
+ */
+const struct fs_parameter_spec v9fs_param_spec[] = {
+	fsparam_string	("source",	Opt_source),
+	fsparam_u32hex	("debug",	Opt_debug),
+	fsparam_uid	("dfltuid",	Opt_dfltuid),
+	fsparam_gid	("dfltgid",	Opt_dfltgid),
+	fsparam_u32	("afid",	Opt_afid),
+	fsparam_string	("uname",	Opt_uname),
+	fsparam_string	("aname",	Opt_remotename),
+	fsparam_flag	("nodevmap",	Opt_nodevmap),
+	fsparam_flag	("noxattr",	Opt_noxattr),
+	fsparam_flag	("directio",	Opt_directio),
+	fsparam_flag	("ignoreqv",	Opt_ignoreqv),
+	fsparam_string	("cache",	Opt_cache),
+	fsparam_string	("cachetag",	Opt_cachetag),
+	fsparam_string	("access",	Opt_access),
+	fsparam_flag	("posixacl",	Opt_posixacl),
+	fsparam_u32	("locktimeout",	Opt_locktimeout),
+
+	/* client options */
+	fsparam_u32	("msize",	Opt_msize),
+	fsparam_flag	("noextend",	Opt_legacy),
+	fsparam_string	("trans",	Opt_trans),
+	fsparam_enum	("version",	Opt_version, p9_versions),
+
+	/* fd transport options */
+	fsparam_u32	("rfdno",	Opt_rfdno),
+	fsparam_u32	("wfdno",	Opt_wfdno),
+
+	/* rdma transport options */
+	fsparam_u32	("sq",		Opt_sq_depth),
+	fsparam_u32	("rq",		Opt_rq_depth),
+	fsparam_u32	("timeout",	Opt_timeout),
+
+	/* fd and rdma transport options */
+	fsparam_u32	("port",	Opt_port),
+	fsparam_flag	("privport",	Opt_privport),
+	{}
 };
 
 /* Interpret mount options for cache mode */
@@ -101,7 +150,7 @@ int v9fs_show_options(struct seq_file *m, struct dentry *root)
 	struct v9fs_session_info *v9ses = root->d_sb->s_fs_info;
 
 	if (v9ses->debug)
-		seq_printf(m, ",debug=%x", v9ses->debug);
+		seq_printf(m, ",debug=%#x", v9ses->debug);
 	if (!uid_eq(v9ses->dfltuid, V9FS_DEFUID))
 		seq_printf(m, ",dfltuid=%u",
 			   from_kuid_munged(&init_user_ns, v9ses->dfltuid));
@@ -117,7 +166,7 @@ int v9fs_show_options(struct seq_file *m, struct dentry *root)
 	if (v9ses->nodev)
 		seq_puts(m, ",nodevmap");
 	if (v9ses->cache)
-		seq_printf(m, ",cache=%x", v9ses->cache);
+		seq_printf(m, ",cache=%#x", v9ses->cache);
 #ifdef CONFIG_9P_FSCACHE
 	if (v9ses->cachetag && (v9ses->cache & CACHE_FSCACHE))
 		seq_printf(m, ",cachetag=%s", v9ses->cachetag);
@@ -153,267 +202,254 @@ int v9fs_show_options(struct seq_file *m, struct dentry *root)
 }
 
 /**
- * v9fs_parse_options - parse mount options into session structure
- * @v9ses: existing v9fs session information
- * @opts: The mount option string
+ * v9fs_parse_param - parse a mount option into the filesystem context
+ * @fc: the filesystem context
+ * @param: the parameter to parse
  *
  * Return 0 upon success, -ERRNO upon failure.
  */
-
-static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
+int v9fs_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
-	char *options, *tmp_options;
-	substring_t args[MAX_OPT_ARGS];
-	char *p;
-	int option = 0;
+	struct v9fs_context *ctx = fc->fs_private;
+	struct fs_parse_result result;
 	char *s;
-	int ret = 0;
-
-	/* setup defaults */
-	v9ses->afid = ~0;
-	v9ses->debug = 0;
-	v9ses->cache = CACHE_NONE;
-#ifdef CONFIG_9P_FSCACHE
-	v9ses->cachetag = NULL;
-#endif
-	v9ses->session_lock_timeout = P9_LOCK_TIMEOUT;
-
-	if (!opts)
-		return 0;
+	int r;
+	int opt;
+	struct p9_client_opts	*clnt = &ctx->client_opts;
+	struct p9_fd_opts	*fd_opts = &ctx->fd_opts;
+	struct p9_rdma_opts	*rdma_opts = &ctx->rdma_opts;
+	struct p9_session_opts	*session_opts = &ctx->session_opts;
+
+	opt = fs_parse(fc, v9fs_param_spec, param, &result);
+	if (opt < 0) {
+		/*
+		 * We might like to report bad mount options here, but
+		 * traditionally 9p has ignored unknown mount options
+		 */
+		if (opt == -ENOPARAM)
+			return 0;
 
-	tmp_options = kstrdup(opts, GFP_KERNEL);
-	if (!tmp_options) {
-		ret = -ENOMEM;
-		goto fail_option_alloc;
+		return opt;
 	}
-	options = tmp_options;
-
-	while ((p = strsep(&options, ",")) != NULL) {
-		int token, r;
-
-		if (!*p)
-			continue;
-
-		token = match_token(p, tokens, args);
-		switch (token) {
-		case Opt_debug:
-			r = match_int(&args[0], &option);
-			if (r < 0) {
-				p9_debug(P9_DEBUG_ERROR,
-					 "integer field, but no integer?\n");
-				ret = r;
-			} else {
-				v9ses->debug = option;
+
+	switch (opt) {
+	case Opt_source:
+		if (fc->source) {
+			pr_info("p9: multiple sources not supported\n");
+			return -EINVAL;
+		}
+		fc->source = param->string;
+		param->string = NULL;
+		break;
+	case Opt_debug:
+		session_opts->debug = result.uint_32;
 #ifdef CONFIG_NET_9P_DEBUG
-				p9_debug_level = option;
+		p9_debug_level = result.uint_32;
 #endif
-			}
-			break;
-
-		case Opt_dfltuid:
-			r = match_int(&args[0], &option);
-			if (r < 0) {
-				p9_debug(P9_DEBUG_ERROR,
-					 "integer field, but no integer?\n");
-				ret = r;
-				continue;
-			}
-			v9ses->dfltuid = make_kuid(current_user_ns(), option);
-			if (!uid_valid(v9ses->dfltuid)) {
-				p9_debug(P9_DEBUG_ERROR,
-					 "uid field, but not a uid?\n");
-				ret = -EINVAL;
-			}
-			break;
-		case Opt_dfltgid:
-			r = match_int(&args[0], &option);
-			if (r < 0) {
-				p9_debug(P9_DEBUG_ERROR,
-					 "integer field, but no integer?\n");
-				ret = r;
-				continue;
-			}
-			v9ses->dfltgid = make_kgid(current_user_ns(), option);
-			if (!gid_valid(v9ses->dfltgid)) {
-				p9_debug(P9_DEBUG_ERROR,
-					 "gid field, but not a gid?\n");
-				ret = -EINVAL;
-			}
-			break;
-		case Opt_afid:
-			r = match_int(&args[0], &option);
-			if (r < 0) {
-				p9_debug(P9_DEBUG_ERROR,
-					 "integer field, but no integer?\n");
-				ret = r;
-			} else {
-				v9ses->afid = option;
-			}
-			break;
-		case Opt_uname:
-			kfree(v9ses->uname);
-			v9ses->uname = match_strdup(&args[0]);
-			if (!v9ses->uname) {
-				ret = -ENOMEM;
-				goto free_and_return;
-			}
-			break;
-		case Opt_remotename:
-			kfree(v9ses->aname);
-			v9ses->aname = match_strdup(&args[0]);
-			if (!v9ses->aname) {
-				ret = -ENOMEM;
-				goto free_and_return;
-			}
-			break;
-		case Opt_nodevmap:
-			v9ses->nodev = 1;
-			break;
-		case Opt_noxattr:
-			v9ses->flags |= V9FS_NO_XATTR;
-			break;
-		case Opt_directio:
-			v9ses->flags |= V9FS_DIRECT_IO;
-			break;
-		case Opt_ignoreqv:
-			v9ses->flags |= V9FS_IGNORE_QV;
-			break;
-		case Opt_cachetag:
+		break;
+
+	case Opt_dfltuid:
+		session_opts->dfltuid = result.uid;
+		break;
+	case Opt_dfltgid:
+		session_opts->dfltgid = result.gid;
+		break;
+	case Opt_afid:
+		session_opts->afid = result.uint_32;
+		break;
+	case Opt_uname:
+		kfree(session_opts->uname);
+		session_opts->uname = param->string;
+		param->string = NULL;
+		break;
+	case Opt_remotename:
+		kfree(session_opts->aname);
+		session_opts->aname = param->string;
+		param->string = NULL;
+		break;
+	case Opt_nodevmap:
+		session_opts->nodev = 1;
+		break;
+	case Opt_noxattr:
+		session_opts->flags |= V9FS_NO_XATTR;
+		break;
+	case Opt_directio:
+		session_opts->flags |= V9FS_DIRECT_IO;
+		break;
+	case Opt_ignoreqv:
+		session_opts->flags |= V9FS_IGNORE_QV;
+		break;
+	case Opt_cachetag:
 #ifdef CONFIG_9P_FSCACHE
-			kfree(v9ses->cachetag);
-			v9ses->cachetag = match_strdup(&args[0]);
-			if (!v9ses->cachetag) {
-				ret = -ENOMEM;
-				goto free_and_return;
-			}
+		kfree(session_opts->cachetag);
+		session_opts->cachetag = param->string;
+		param->string = NULL;
 #endif
-			break;
-		case Opt_cache:
-			s = match_strdup(&args[0]);
-			if (!s) {
-				ret = -ENOMEM;
-				p9_debug(P9_DEBUG_ERROR,
-					 "problem allocating copy of cache arg\n");
-				goto free_and_return;
-			}
-			r = get_cache_mode(s);
-			if (r < 0)
-				ret = r;
-			else
-				v9ses->cache = r;
-
-			kfree(s);
-			break;
-
-		case Opt_access:
-			s = match_strdup(&args[0]);
-			if (!s) {
-				ret = -ENOMEM;
-				p9_debug(P9_DEBUG_ERROR,
-					 "problem allocating copy of access arg\n");
-				goto free_and_return;
+		break;
+	case Opt_cache:
+		r = get_cache_mode(param->string);
+		if (r < 0)
+			return r;
+		session_opts->cache = r;
+		break;
+	case Opt_access:
+		s = param->string;
+		session_opts->flags &= ~V9FS_ACCESS_MASK;
+		if (strcmp(s, "user") == 0) {
+			session_opts->flags |= V9FS_ACCESS_USER;
+		} else if (strcmp(s, "any") == 0) {
+			session_opts->flags |= V9FS_ACCESS_ANY;
+		} else if (strcmp(s, "client") == 0) {
+			session_opts->flags |= V9FS_ACCESS_CLIENT;
+		} else {
+			uid_t uid;
+
+			session_opts->flags |= V9FS_ACCESS_SINGLE;
+			r = kstrtouint(s, 10, &uid);
+			if (r) {
+				pr_info("Unknown access argument %s: %d\n",
+					param->string, r);
+				return r;
 			}
-
-			v9ses->flags &= ~V9FS_ACCESS_MASK;
-			if (strcmp(s, "user") == 0)
-				v9ses->flags |= V9FS_ACCESS_USER;
-			else if (strcmp(s, "any") == 0)
-				v9ses->flags |= V9FS_ACCESS_ANY;
-			else if (strcmp(s, "client") == 0) {
-				v9ses->flags |= V9FS_ACCESS_CLIENT;
-			} else {
-				uid_t uid;
-
-				v9ses->flags |= V9FS_ACCESS_SINGLE;
-				r = kstrtouint(s, 10, &uid);
-				if (r) {
-					ret = r;
-					pr_info("Unknown access argument %s: %d\n",
-						s, r);
-					kfree(s);
-					continue;
-				}
-				v9ses->uid = make_kuid(current_user_ns(), uid);
-				if (!uid_valid(v9ses->uid)) {
-					ret = -EINVAL;
-					pr_info("Unknown uid %s\n", s);
-				}
+			session_opts->uid = make_kuid(current_user_ns(), uid);
+			if (!uid_valid(session_opts->uid)) {
+				pr_info("Unknown uid %s\n", s);
+				return -EINVAL;
 			}
+		}
+		break;
 
-			kfree(s);
-			break;
-
-		case Opt_posixacl:
+	case Opt_posixacl:
 #ifdef CONFIG_9P_FS_POSIX_ACL
-			v9ses->flags |= V9FS_POSIX_ACL;
+		session_opts->flags |= V9FS_POSIX_ACL;
 #else
-			p9_debug(P9_DEBUG_ERROR,
-				 "Not defined CONFIG_9P_FS_POSIX_ACL. Ignoring posixacl option\n");
+		p9_debug(P9_DEBUG_ERROR,
+			 "Not defined CONFIG_9P_FS_POSIX_ACL. Ignoring posixacl option\n");
 #endif
-			break;
-
-		case Opt_locktimeout:
-			r = match_int(&args[0], &option);
-			if (r < 0) {
-				p9_debug(P9_DEBUG_ERROR,
-					 "integer field, but no integer?\n");
-				ret = r;
-				continue;
-			}
-			if (option < 1) {
-				p9_debug(P9_DEBUG_ERROR,
-					 "locktimeout must be a greater than zero integer.\n");
-				ret = -EINVAL;
-				continue;
-			}
-			v9ses->session_lock_timeout = (long)option * HZ;
-			break;
+		break;
 
-		default:
-			continue;
+	case Opt_locktimeout:
+		if (result.uint_32 < 1) {
+			p9_debug(P9_DEBUG_ERROR,
+				 "locktimeout must be a greater than zero integer.\n");
+			return -EINVAL;
+		}
+		session_opts->session_lock_timeout = (long)result.uint_32 * HZ;
+		break;
+
+	/* Options for client */
+	case Opt_msize:
+		if (result.uint_32 < 4096) {
+			p9_debug(P9_DEBUG_ERROR, "msize should be at least 4k\n");
+			return -EINVAL;
+		}
+		if (result.uint_32 > INT_MAX) {
+			p9_debug(P9_DEBUG_ERROR, "msize too big\n");
+			return -EINVAL;
 		}
+		clnt->msize = result.uint_32;
+		break;
+	case Opt_trans:
+		v9fs_put_trans(clnt->trans_mod);
+		clnt->trans_mod = v9fs_get_trans_by_name(param->string);
+		if (!clnt->trans_mod) {
+			pr_info("Could not find request transport: %s\n",
+				param->string);
+			return -EINVAL;
+		}
+		break;
+	case Opt_legacy:
+		clnt->proto_version = p9_proto_legacy;
+		break;
+	case Opt_version:
+		clnt->proto_version = result.uint_32;
+		p9_debug(P9_DEBUG_9P, "Protocol version: %s\n", param->string);
+		break;
+	/* Options for fd transport */
+	case Opt_rfdno:
+		fd_opts->rfd = result.uint_32;
+		break;
+	case Opt_wfdno:
+		fd_opts->wfd = result.uint_32;
+		break;
+	/* Options for rdma transport */
+	case Opt_sq_depth:
+		rdma_opts->sq_depth = result.uint_32;
+		break;
+	case Opt_rq_depth:
+		rdma_opts->rq_depth = result.uint_32;
+		break;
+	case Opt_timeout:
+		rdma_opts->timeout = result.uint_32;
+		break;
+	/* Options for both fd and rdma transports */
+	case Opt_port:
+		fd_opts->port = result.uint_32;
+		rdma_opts->port = result.uint_32;
+		break;
+	case Opt_privport:
+		fd_opts->privport = true;
+		rdma_opts->privport = true;
+		break;
 	}
 
-free_and_return:
-	kfree(tmp_options);
-fail_option_alloc:
-	return ret;
+	return 0;
+}
+
+static void v9fs_apply_options(struct v9fs_session_info *v9ses,
+		  struct fs_context *fc)
+{
+	struct v9fs_context	*ctx = fc->fs_private;
+
+	v9ses->debug = ctx->session_opts.debug;
+	v9ses->dfltuid = ctx->session_opts.dfltuid;
+	v9ses->dfltgid = ctx->session_opts.dfltgid;
+	v9ses->afid = ctx->session_opts.afid;
+	v9ses->uname = ctx->session_opts.uname;
+	ctx->session_opts.uname = NULL;
+	v9ses->aname = ctx->session_opts.aname;
+	ctx->session_opts.aname = NULL;
+	v9ses->nodev = ctx->session_opts.nodev;
+	/*
+	 * Note that we must |= flags here as session_init already
+	 * set basic flags. This adds in flags from parsed options.
+	 */
+	v9ses->flags |= ctx->session_opts.flags;
+#ifdef CONFIG_9P_FSCACHE
+	v9ses->cachetag = ctx->session_opts.cachetag;
+	ctx->session_opts.cachetag = NULL;
+#endif
+	v9ses->cache = ctx->session_opts.cache;
+	v9ses->uid = ctx->session_opts.uid;
+	v9ses->session_lock_timeout = ctx->session_opts.session_lock_timeout;
 }
 
 /**
  * v9fs_session_init - initialize session
  * @v9ses: session information structure
- * @dev_name: device being mounted
- * @data: options
+ * @fc: the filesystem mount context
  *
  */
 
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
-		  const char *dev_name, char *data)
+		  struct fs_context *fc)
 {
 	struct p9_fid *fid;
 	int rc = -ENOMEM;
 
-	v9ses->uname = kstrdup(V9FS_DEFUSER, GFP_KERNEL);
-	if (!v9ses->uname)
-		goto err_names;
-
-	v9ses->aname = kstrdup(V9FS_DEFANAME, GFP_KERNEL);
-	if (!v9ses->aname)
-		goto err_names;
 	init_rwsem(&v9ses->rename_sem);
 
-	v9ses->uid = INVALID_UID;
-	v9ses->dfltuid = V9FS_DEFUID;
-	v9ses->dfltgid = V9FS_DEFGID;
-
-	v9ses->clnt = p9_client_create(dev_name, data);
+	v9ses->clnt = p9_client_create(fc);
 	if (IS_ERR(v9ses->clnt)) {
 		rc = PTR_ERR(v9ses->clnt);
 		p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n");
 		goto err_names;
 	}
 
+	/*
+	 * Initialize flags on the real v9ses. v9fs_apply_options below
+	 * will |= the additional flags from parsed options.
+	 */
 	v9ses->flags = V9FS_ACCESS_USER;
 
 	if (p9_is_proto_dotl(v9ses->clnt)) {
@@ -423,9 +459,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		v9ses->flags |= V9FS_PROTO_2000U;
 	}
 
-	rc = v9fs_parse_options(v9ses, data);
-	if (rc < 0)
-		goto err_clnt;
+	v9fs_apply_options(v9ses, fc);
 
 	v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
 
@@ -438,8 +472,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		v9ses->flags &= ~V9FS_ACCESS_MASK;
 		v9ses->flags |= V9FS_ACCESS_USER;
 	}
-	/*FIXME !! */
-	/* for legacy mode, fall back to V9FS_ACCESS_ANY */
+	/* FIXME: for legacy mode, fall back to V9FS_ACCESS_ANY */
 	if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) &&
 		((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) {
 
@@ -450,7 +483,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	if (!v9fs_proto_dotl(v9ses) ||
 		!((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) {
 		/*
-		 * We support ACL checks on clinet only if the protocol is
+		 * We support ACL checks on client only if the protocol is
 		 * 9P2000.L and access is V9FS_ACCESS_CLIENT.
 		 */
 		v9ses->flags &= ~V9FS_ACL_MASK;
@@ -472,7 +505,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 #ifdef CONFIG_9P_FSCACHE
 	/* register the session for caching */
 	if (v9ses->cache & CACHE_FSCACHE) {
-		rc = v9fs_cache_session_get_cookie(v9ses, dev_name);
+		rc = v9fs_cache_session_get_cookie(v9ses, fc->source);
 		if (rc < 0)
 			goto err_clnt;
 	}
@@ -561,7 +594,7 @@ static ssize_t caches_show(struct kobject *kobj,
 	spin_lock(&v9fs_sessionlist_lock);
 	list_for_each_entry(v9ses, &v9fs_sessionlist, slist) {
 		if (v9ses->cachetag) {
-			n = snprintf(buf, limit, "%s\n", v9ses->cachetag);
+			n = snprintf(buf + count, limit, "%s\n", v9ses->cachetag);
 			if (n < 0) {
 				count = n;
 				break;
@@ -597,13 +630,16 @@ static const struct attribute_group v9fs_attr_group = {
 
 static int __init v9fs_sysfs_init(void)
 {
+	int ret;
+
 	v9fs_kobj = kobject_create_and_add("9p", fs_kobj);
 	if (!v9fs_kobj)
 		return -ENOMEM;
 
-	if (sysfs_create_group(v9fs_kobj, &v9fs_attr_group)) {
+	ret = sysfs_create_group(v9fs_kobj, &v9fs_attr_group);
+	if (ret) {
 		kobject_put(v9fs_kobj);
-		return -ENOMEM;
+		return ret;
 	}
 
 	return 0;
@@ -659,21 +695,6 @@ static void v9fs_destroy_inode_cache(void)
 	kmem_cache_destroy(v9fs_inode_cache);
 }
 
-static int v9fs_cache_register(void)
-{
-	int ret;
-
-	ret = v9fs_init_inode_cache();
-	if (ret < 0)
-		return ret;
-	return ret;
-}
-
-static void v9fs_cache_unregister(void)
-{
-	v9fs_destroy_inode_cache();
-}
-
 /**
  * init_v9fs - Initialize module
  *
@@ -684,9 +705,9 @@ static int __init init_v9fs(void)
 	int err;
 
 	pr_info("Installing v9fs 9p2000 file system support\n");
-	/* TODO: Setup list of registered trasnport modules */
+	/* TODO: Setup list of registered transport modules */
 
-	err = v9fs_cache_register();
+	err = v9fs_init_inode_cache();
 	if (err < 0) {
 		pr_err("Failed to register v9fs for caching\n");
 		return err;
@@ -709,7 +730,7 @@ out_sysfs_cleanup:
 	v9fs_sysfs_cleanup();
 
 out_cache:
-	v9fs_cache_unregister();
+	v9fs_destroy_inode_cache();
 
 	return err;
 }
@@ -722,7 +743,7 @@ out_cache:
 static void __exit exit_v9fs(void)
 {
 	v9fs_sysfs_cleanup();
-	v9fs_cache_unregister();
+	v9fs_destroy_inode_cache();
 	unregister_filesystem(&v9fs_fs_type);
 }
 
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 1775fcc7f0e8..6a12445d3858 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -10,6 +10,9 @@
 
 #include <linux/backing-dev.h>
 #include <linux/netfs.h>
+#include <linux/fs_parser.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
 
 /**
  * enum p9_session_flags - option flags for each 9P session
@@ -163,11 +166,13 @@ static inline struct fscache_volume *v9fs_session_cache(struct v9fs_session_info
 #endif
 }
 
+extern const struct fs_parameter_spec v9fs_param_spec[];
 
+extern int v9fs_parse_param(struct fs_context *fc, struct fs_parameter *param);
 extern int v9fs_show_options(struct seq_file *m, struct dentry *root);
 
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
-				 const char *dev_name, char *data);
+				 struct fs_context *fc);
 extern void v9fs_session_close(struct v9fs_session_info *v9ses);
 extern void v9fs_session_cancel(struct v9fs_session_info *v9ses);
 extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
@@ -179,14 +184,16 @@ extern int v9fs_vfs_rename(struct mnt_idmap *idmap,
 			   struct inode *old_dir, struct dentry *old_dentry,
 			   struct inode *new_dir, struct dentry *new_dentry,
 			   unsigned int flags);
-extern struct inode *v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid,
-						bool new);
+extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses,
+					 struct p9_fid *fid,
+					 struct super_block *sb, int new);
 extern const struct inode_operations v9fs_dir_inode_operations_dotl;
 extern const struct inode_operations v9fs_file_inode_operations_dotl;
 extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
 extern const struct netfs_request_ops v9fs_req_ops;
-extern struct inode *v9fs_fid_iget_dotl(struct super_block *sb,
-						struct p9_fid *fid, bool new);
+extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
+					      struct p9_fid *fid,
+					      struct super_block *sb, int new);
 
 /* other default globals */
 #define V9FS_PORT	564
@@ -200,7 +207,7 @@ static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
 	return inode->i_sb->s_fs_info;
 }
 
-static inline struct v9fs_session_info *v9fs_dentry2v9ses(struct dentry *dentry)
+static inline struct v9fs_session_info *v9fs_dentry2v9ses(const struct dentry *dentry)
 {
 	return dentry->d_sb->s_fs_info;
 }
@@ -225,12 +232,30 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
  */
 static inline struct inode *
 v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
-			struct super_block *sb, bool new)
+			struct super_block *sb)
 {
 	if (v9fs_proto_dotl(v9ses))
-		return v9fs_fid_iget_dotl(sb, fid, new);
+		return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 0);
 	else
-		return v9fs_fid_iget(sb, fid, new);
+		return v9fs_inode_from_fid(v9ses, fid, sb, 0);
+}
+
+/**
+ * v9fs_get_new_inode_from_fid - Helper routine to populate an inode by
+ * issuing an attribute request
+ * @v9ses: session information
+ * @fid: fid to issue attribute request for
+ * @sb: superblock on which to create inode
+ *
+ */
+static inline struct inode *
+v9fs_get_new_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+			    struct super_block *sb)
+{
+	if (v9fs_proto_dotl(v9ses))
+		return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 1);
+	else
+		return v9fs_inode_from_fid(v9ses, fid, sb, 1);
 }
 
 #endif
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 7923c3c347cb..d3aefbec4de6 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -42,7 +42,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb);
 void v9fs_free_inode(struct inode *inode);
 void v9fs_set_netfs_context(struct inode *inode);
 int v9fs_init_inode(struct v9fs_session_info *v9ses,
-		    struct inode *inode, struct p9_qid *qid, umode_t mode, dev_t rdev);
+		    struct inode *inode, umode_t mode, dev_t rdev);
 void v9fs_evict_inode(struct inode *inode);
 #if (BITS_PER_LONG == 32)
 #define QID2INO(q) ((ino_t) (((q)->path+2) ^ (((q)->path) >> 32)))
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index a97ceb105cd8..862164181bac 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -57,7 +57,9 @@ static void v9fs_issue_write(struct netfs_io_subrequest *subreq)
 	int err, len;
 
 	len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err);
-	netfs_write_subrequest_terminated(subreq, len ?: err, false);
+	if (len > 0)
+		__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
+	netfs_write_subrequest_terminated(subreq, len ?: err);
 }
 
 /**
@@ -68,16 +70,25 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
 {
 	struct netfs_io_request *rreq = subreq->rreq;
 	struct p9_fid *fid = rreq->netfs_priv;
+	unsigned long long pos = subreq->start + subreq->transferred;
 	int total, err;
 
-	total = p9_client_read(fid, subreq->start + subreq->transferred,
-			       &subreq->io_iter, &err);
+	total = p9_client_read(fid, pos, &subreq->io_iter, &err);
 
 	/* if we just extended the file size, any portion not in
 	 * cache won't be on server and is zeroes */
-	__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+	if (subreq->rreq->origin != NETFS_UNBUFFERED_READ &&
+	    subreq->rreq->origin != NETFS_DIO_READ)
+		__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+	if (pos + total >= i_size_read(rreq->inode))
+		__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
+	if (!err && total) {
+		subreq->transferred += total;
+		__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
+	}
 
-	netfs_subreq_terminated(subreq, err ?: total, false);
+	subreq->error = err;
+	netfs_read_subreq_terminated(subreq);
 }
 
 /**
@@ -154,4 +165,5 @@ const struct address_space_operations v9fs_addr_operations = {
 	.invalidate_folio	= netfs_invalidate_folio,
 	.direct_IO		= noop_direct_IO,
 	.writepages		= netfs_writepages,
+	.migrate_folio		= filemap_migrate_folio,
 };
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index f16f73581634..c5bf74d547e8 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -48,15 +48,20 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry)
 static void v9fs_dentry_release(struct dentry *dentry)
 {
 	struct hlist_node *p, *n;
+	struct hlist_head head;
 
 	p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n",
 		 dentry, dentry);
-	hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata)
+
+	spin_lock(&dentry->d_lock);
+	hlist_move_list((struct hlist_head *)&dentry->d_fsdata, &head);
+	spin_unlock(&dentry->d_lock);
+
+	hlist_for_each_safe(p, n, &head)
 		p9_fid_put(hlist_entry(p, struct p9_fid, dlist));
-	dentry->d_fsdata = NULL;
 }
 
-static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+static int __v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 {
 	struct p9_fid *fid;
 	struct inode *inode;
@@ -75,8 +80,13 @@ static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 		struct v9fs_session_info *v9ses;
 
 		fid = v9fs_fid_lookup(dentry);
-		if (IS_ERR(fid))
+		if (IS_ERR(fid)) {
+			p9_debug(
+				P9_DEBUG_VFS,
+				"v9fs_fid_lookup: dentry = %pd (%p), got error %pe\n",
+				dentry, dentry, fid);
 			return PTR_ERR(fid);
+		}
 
 		v9ses = v9fs_inode2v9ses(inode);
 		if (v9fs_proto_dotl(v9ses))
@@ -85,23 +95,57 @@ static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 			retval = v9fs_refresh_inode(fid, inode);
 		p9_fid_put(fid);
 
-		if (retval == -ENOENT)
+		if (retval == -ENOENT) {
+			p9_debug(P9_DEBUG_VFS, "dentry: %pd (%p) invalidated due to ENOENT\n",
+				 dentry, dentry);
+			return 0;
+		}
+		if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) {
+			p9_debug(P9_DEBUG_VFS, "dentry: %pd (%p) invalidated due to type change\n",
+				 dentry, dentry);
 			return 0;
-		if (retval < 0)
+		}
+		if (retval < 0) {
+			p9_debug(P9_DEBUG_VFS,
+				"refresh inode: dentry = %pd (%p), got error %pe\n",
+				dentry, dentry, ERR_PTR(retval));
 			return retval;
+		}
 	}
 out_valid:
+	p9_debug(P9_DEBUG_VFS, "dentry: %pd (%p) is valid\n", dentry, dentry);
 	return 1;
 }
 
+static int v9fs_lookup_revalidate(struct inode *dir, const struct qstr *name,
+				  struct dentry *dentry, unsigned int flags)
+{
+	return __v9fs_lookup_revalidate(dentry, flags);
+}
+
+static bool v9fs_dentry_unalias_trylock(const struct dentry *dentry)
+{
+	struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry);
+	return down_write_trylock(&v9ses->rename_sem);
+}
+
+static void v9fs_dentry_unalias_unlock(const struct dentry *dentry)
+{
+	struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry);
+	up_write(&v9ses->rename_sem);
+}
+
 const struct dentry_operations v9fs_cached_dentry_operations = {
 	.d_revalidate = v9fs_lookup_revalidate,
-	.d_weak_revalidate = v9fs_lookup_revalidate,
+	.d_weak_revalidate = __v9fs_lookup_revalidate,
 	.d_delete = v9fs_cached_dentry_delete,
 	.d_release = v9fs_dentry_release,
+	.d_unalias_trylock = v9fs_dentry_unalias_trylock,
+	.d_unalias_unlock = v9fs_dentry_unalias_unlock,
 };
 
 const struct dentry_operations v9fs_dentry_operations = {
-	.d_delete = always_delete_dentry,
 	.d_release = v9fs_dentry_release,
+	.d_unalias_trylock = v9fs_dentry_unalias_trylock,
+	.d_unalias_unlock = v9fs_dentry_unalias_unlock,
 };
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 348cc90bf9c5..6f3880208587 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -43,14 +43,18 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid;
 	int omode;
+	int o_append;
 
 	p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
 	v9ses = v9fs_inode2v9ses(inode);
-	if (v9fs_proto_dotl(v9ses))
+	if (v9fs_proto_dotl(v9ses)) {
 		omode = v9fs_open_to_dotl_flags(file->f_flags);
-	else
+		o_append = P9_DOTL_APPEND;
+	} else {
 		omode = v9fs_uflags2omode(file->f_flags,
 					v9fs_proto_dotu(v9ses));
+		o_append = P9_OAPPEND;
+	}
 	fid = file->private_data;
 	if (!fid) {
 		fid = v9fs_fid_clone(file_dentry(file));
@@ -58,9 +62,10 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 			return PTR_ERR(fid);
 
 		if ((v9ses->cache & CACHE_WRITEBACK) && (omode & P9_OWRITE)) {
-			int writeback_omode = (omode & ~P9_OWRITE) | P9_ORDWR;
+			int writeback_omode = (omode & ~(P9_OWRITE | o_append)) | P9_ORDWR;
 
 			p9_debug(P9_DEBUG_CACHE, "write-only file with writeback enabled, try opening O_RDWR\n");
+
 			err = p9_client_open(fid, writeback_omode);
 			if (err < 0) {
 				p9_debug(P9_DEBUG_CACHE, "could not open O_RDWR, disabling caches\n");
@@ -454,9 +459,10 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
 }
 
 static int
-v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma)
+v9fs_file_mmap_prepare(struct vm_area_desc *desc)
 {
 	int retval;
+	struct file *filp = desc->file;
 	struct inode *inode = file_inode(filp);
 	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
 
@@ -464,12 +470,12 @@ v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma)
 
 	if (!(v9ses->cache & CACHE_WRITEBACK)) {
 		p9_debug(P9_DEBUG_CACHE, "(read-only mmap mode)");
-		return generic_file_readonly_mmap(filp, vma);
+		return generic_file_readonly_mmap_prepare(desc);
 	}
 
-	retval = generic_file_mmap(filp, vma);
+	retval = generic_file_mmap_prepare(desc);
 	if (!retval)
-		vma->vm_ops = &v9fs_mmap_file_vm_ops;
+		desc->vm_ops = &v9fs_mmap_file_vm_ops;
 
 	return retval;
 }
@@ -482,24 +488,15 @@ v9fs_vm_page_mkwrite(struct vm_fault *vmf)
 
 static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
 {
-	struct inode *inode;
-
-	struct writeback_control wbc = {
-		.nr_to_write = LONG_MAX,
-		.sync_mode = WB_SYNC_ALL,
-		.range_start = (loff_t)vma->vm_pgoff * PAGE_SIZE,
-		 /* absolute end, byte at end included */
-		.range_end = (loff_t)vma->vm_pgoff * PAGE_SIZE +
-			(vma->vm_end - vma->vm_start - 1),
-	};
-
 	if (!(vma->vm_flags & VM_SHARED))
 		return;
 
 	p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma);
 
-	inode = file_inode(vma->vm_file);
-	filemap_fdatawrite_wbc(inode->i_mapping, &wbc);
+	filemap_fdatawrite_range(file_inode(vma->vm_file)->i_mapping,
+			(loff_t)vma->vm_pgoff * PAGE_SIZE,
+			(loff_t)vma->vm_pgoff * PAGE_SIZE +
+				(vma->vm_end - vma->vm_start - 1));
 }
 
 static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
@@ -516,7 +513,7 @@ const struct file_operations v9fs_file_operations = {
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
 	.lock = v9fs_file_lock,
-	.mmap = generic_file_readonly_mmap,
+	.mmap_prepare = generic_file_readonly_mmap_prepare,
 	.splice_read = v9fs_file_splice_read,
 	.splice_write = iter_file_splice_write,
 	.fsync = v9fs_file_fsync,
@@ -531,7 +528,7 @@ const struct file_operations v9fs_file_operations_dotl = {
 	.release = v9fs_dir_release,
 	.lock = v9fs_file_lock_dotl,
 	.flock = v9fs_file_flock_dotl,
-	.mmap = v9fs_file_mmap,
+	.mmap_prepare = v9fs_file_mmap_prepare,
 	.splice_read = v9fs_file_splice_read,
 	.splice_write = iter_file_splice_write,
 	.fsync = v9fs_file_fsync_dotl,
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 7a3308d77606..97abe65bf7c1 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -256,12 +256,9 @@ void v9fs_set_netfs_context(struct inode *inode)
 }
 
 int v9fs_init_inode(struct v9fs_session_info *v9ses,
-		    struct inode *inode, struct p9_qid *qid, umode_t mode, dev_t rdev)
+		    struct inode *inode, umode_t mode, dev_t rdev)
 {
 	int err = 0;
-	struct v9fs_inode *v9inode = V9FS_I(inode);
-
-	memcpy(&v9inode->qid, qid, sizeof(struct p9_qid));
 
 	inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
 	inode->i_blocks = 0;
@@ -348,6 +345,7 @@ void v9fs_evict_inode(struct inode *inode)
 	__le32 __maybe_unused version;
 
 	if (!is_bad_inode(inode)) {
+		netfs_wait_for_outstanding_io(inode);
 		truncate_inode_pages_final(&inode->i_data);
 
 		version = cpu_to_le32(v9inode->qid.version);
@@ -364,59 +362,105 @@ void v9fs_evict_inode(struct inode *inode)
 		clear_inode(inode);
 }
 
-struct inode *
-v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid, bool new)
+/*
+ * iget5_locked() test callback: returns 1 when the cached inode has the
+ * same file type and the same qid (version, type, path) as the p9_wstat
+ * passed in @data, and 0 otherwise.
+ */
+static int v9fs_test_inode(struct inode *inode, void *data)
+{
+	dev_t rdev;
+	struct p9_wstat *st = data;
+	struct v9fs_inode *v9inode = V9FS_I(inode);
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
+	int umode = p9mode2unixmode(v9ses, st, &rdev);
+
+	/* an inode of a different file type can never match */
+	if (inode_wrong_type(inode, umode))
+		return 0;
+
+	/* every qid field must agree; version is compared bytewise */
+	if (memcmp(&v9inode->qid.version, &st->qid.version,
+		   sizeof(v9inode->qid.version)) ||
+	    v9inode->qid.type != st->qid.type ||
+	    v9inode->qid.path != st->qid.path)
+		return 0;
+	return 1;
+}
+
+static int v9fs_test_new_inode(struct inode *inode, void *data)
+{
+	return 0;
+}
+
+/* iget5_locked() set callback: record the qid from the stat in @data. */
+static int v9fs_set_inode(struct inode *inode, void *data)
+{
+	const struct p9_wstat *stat = data;
+
+	memcpy(&V9FS_I(inode)->qid, &stat->qid, sizeof(stat->qid));
+	return 0;
+}
+
+static struct inode *v9fs_qid_iget(struct super_block *sb,
+				   struct p9_qid *qid,
+				   struct p9_wstat *st,
+				   int new)
 {
 	dev_t rdev;
 	int retval;
 	umode_t umode;
 	struct inode *inode;
-	struct p9_wstat *st;
 	struct v9fs_session_info *v9ses = sb->s_fs_info;
+	int (*test)(struct inode *inode, void *data);
 
-	inode = iget_locked(sb, QID2INO(&fid->qid));
-	if (unlikely(!inode))
-		return ERR_PTR(-ENOMEM);
-	if (!(inode->i_state & I_NEW)) {
-		if (!new) {
-			goto done;
-		} else {
-			p9_debug(P9_DEBUG_VFS, "WARNING: Inode collision %ld\n",
-						inode->i_ino);
-			iput(inode);
-			remove_inode_hash(inode);
-			inode = iget_locked(sb, QID2INO(&fid->qid));
-			WARN_ON(!(inode->i_state & I_NEW));
-		}
-	}
+	if (new)
+		test = v9fs_test_new_inode;
+	else
+		test = v9fs_test_inode;
 
+	inode = iget5_locked(sb, QID2INO(qid), test, v9fs_set_inode, st);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode_state_read_once(inode) & I_NEW))
+		return inode;
 	/*
 	 * initialize the inode with the stat info
 	 * FIXME!! we may need support for stale inodes
 	 * later.
 	 */
-	st = p9_client_stat(fid);
-	if (IS_ERR(st)) {
-		retval = PTR_ERR(st);
-		goto error;
-	}
-
+	inode->i_ino = QID2INO(qid);
 	umode = p9mode2unixmode(v9ses, st, &rdev);
-	retval = v9fs_init_inode(v9ses, inode, &fid->qid, umode, rdev);
-	v9fs_stat2inode(st, inode, sb, 0);
-	p9stat_free(st);
-	kfree(st);
+	retval = v9fs_init_inode(v9ses, inode, umode, rdev);
 	if (retval)
 		goto error;
 
+	v9fs_stat2inode(st, inode, sb, 0);
 	v9fs_set_netfs_context(inode);
 	v9fs_cache_inode_get_cookie(inode);
 	unlock_new_inode(inode);
-done:
 	return inode;
 error:
 	iget_failed(inode);
 	return ERR_PTR(retval);
+
+}
+
+/* Stat @fid with p9_client_stat() and pass the result to v9fs_qid_iget(). */
+struct inode *
+v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+		    struct super_block *sb, int new)
+{
+	struct inode *inode;
+	struct p9_wstat *st = p9_client_stat(fid);
+
+	if (IS_ERR(st))
+		return ERR_CAST(st);
+
+	inode = v9fs_qid_iget(sb, &st->qid, st, new);
+	p9stat_free(st);	/* stat buffer is released on every path */
+	kfree(st);
+	return inode;
+}
 
 /**
@@ -448,15 +492,8 @@ static int v9fs_at_to_dotl_flags(int flags)
  */
 static void v9fs_dec_count(struct inode *inode)
 {
-	if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) {
-		if (inode->i_nlink) {
-			drop_nlink(inode);
-		} else {
-			p9_debug(P9_DEBUG_VFS,
-						"WARNING: unexpected i_nlink zero %d inode %ld\n",
-						inode->i_nlink, inode->i_ino);
-		}
-	}
+	if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
+		drop_nlink(inode);
 }
 
 /**
@@ -507,9 +544,6 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags)
 		} else
 			v9fs_dec_count(inode);
 
-		if (inode->i_nlink <= 0)	/* no more refs unhash it */
-			remove_inode_hash(inode);
-
 		v9fs_invalidate_inode_attr(inode);
 		v9fs_invalidate_inode_attr(dir);
 
@@ -575,7 +609,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 		/*
 		 * instantiate inode and assign the unopened fid to the dentry
 		 */
-		inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb, true);
+		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
 			p9_debug(P9_DEBUG_VFS,
@@ -635,8 +669,8 @@ v9fs_vfs_create(struct mnt_idmap *idmap, struct inode *dir,
  *
  */
 
-static int v9fs_vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
-			  struct dentry *dentry, umode_t mode)
+static struct dentry *v9fs_vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+				     struct dentry *dentry, umode_t mode)
 {
 	int err;
 	u32 perm;
@@ -658,8 +692,7 @@ static int v9fs_vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 
 	if (fid)
 		p9_fid_put(fid);
-
-	return err;
+	return ERR_PTR(err);
 }
 
 /**
@@ -703,8 +736,10 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 		inode = NULL;
 	else if (IS_ERR(fid))
 		inode = ERR_CAST(fid);
+	else if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
+		inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
 	else
-		inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb, false);
+		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
 	/*
 	 * If we had a rename on the server and a parallel lookup
 	 * for the new name, then make sure we instantiate with
@@ -733,44 +768,40 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
 	struct v9fs_inode __maybe_unused *v9inode;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid;
-	struct dentry *res = NULL;
 	struct inode *inode;
 	int p9_omode;
 
 	if (d_in_lookup(dentry)) {
-		res = v9fs_vfs_lookup(dir, dentry, 0);
-		if (IS_ERR(res))
-			return PTR_ERR(res);
-
-		if (res)
-			dentry = res;
+		struct dentry *res = v9fs_vfs_lookup(dir, dentry, 0);
+		if (res || d_really_is_positive(dentry))
+			return finish_no_open(file, res);
 	}
 
 	/* Only creates */
-	if (!(flags & O_CREAT) || d_really_is_positive(dentry))
-		return finish_no_open(file, res);
+	if (!(flags & O_CREAT))
+		return finish_no_open(file, NULL);
 
 	v9ses = v9fs_inode2v9ses(dir);
 	perm = unixmode2p9mode(v9ses, mode);
 	p9_omode = v9fs_uflags2omode(flags, v9fs_proto_dotu(v9ses));
 
 	if ((v9ses->cache & CACHE_WRITEBACK) && (p9_omode & P9_OWRITE)) {
-		p9_omode = (p9_omode & ~P9_OWRITE) | P9_ORDWR;
+		p9_omode = (p9_omode & ~(P9_OWRITE | P9_OAPPEND)) | P9_ORDWR;
 		p9_debug(P9_DEBUG_CACHE,
 			"write-only file with writeback enabled, creating w/ O_RDWR\n");
 	}
 	fid = v9fs_create(v9ses, dir, dentry, NULL, perm, p9_omode);
-	if (IS_ERR(fid)) {
-		err = PTR_ERR(fid);
-		goto error;
-	}
+	if (IS_ERR(fid))
+		return PTR_ERR(fid);
 
 	v9fs_invalidate_inode_attr(dir);
 	inode = d_inode(dentry);
 	v9inode = V9FS_I(inode);
 	err = finish_open(file, dentry, generic_file_open);
-	if (err)
-		goto error;
+	if (unlikely(err)) {
+		p9_fid_put(fid);
+		return err;
+	}
 
 	file->private_data = fid;
 #ifdef CONFIG_9P_FSCACHE
@@ -783,13 +814,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
 	v9fs_open_fid_add(inode, &fid);
 
 	file->f_mode |= FMODE_CREATED;
-out:
-	dput(res);
-	return err;
-
-error:
-	p9_fid_put(fid);
-	goto out;
+	return 0;
 }
 
 /**
@@ -1368,4 +1393,3 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
 	.getattr = v9fs_vfs_getattr,
 	.setattr = v9fs_vfs_setattr,
 };
-
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index c61b97bd13b9..643e759eacb2 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -52,50 +52,80 @@ static kgid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
 	return current_fsgid();
 }
 
+static int v9fs_test_inode_dotl(struct inode *inode, void *data)
+{
+	struct v9fs_inode *v9inode = V9FS_I(inode);
+	struct p9_stat_dotl *st = (struct p9_stat_dotl *)data;
 
+	/* don't match inode of different type */
+	if (inode_wrong_type(inode, st->st_mode))
+		return 0;
 
-struct inode *
-v9fs_fid_iget_dotl(struct super_block *sb, struct p9_fid *fid, bool new)
+	if (inode->i_generation != st->st_gen)
+		return 0;
+
+	/* compare qid details */
+	if (memcmp(&v9inode->qid.version,
+		   &st->qid.version, sizeof(v9inode->qid.version)))
+		return 0;
+
+	if (v9inode->qid.type != st->qid.type)
+		return 0;
+
+	if (v9inode->qid.path != st->qid.path)
+		return 0;
+	return 1;
+}
+
+/* Always get a new inode */
+static int v9fs_test_new_inode_dotl(struct inode *inode, void *data)
+{
+	return 0;
+}
+
+/* iget5_locked() set callback: copy qid and generation from @data. */
+static int v9fs_set_inode_dotl(struct inode *inode, void *data)
+{
+	const struct p9_stat_dotl *stat = data;
+
+	memcpy(&V9FS_I(inode)->qid, &stat->qid, sizeof(stat->qid));
+	inode->i_generation = stat->st_gen;
+	return 0;
+}
+
+static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
+					struct p9_qid *qid,
+					struct p9_fid *fid,
+					struct p9_stat_dotl *st,
+					int new)
 {
 	int retval;
 	struct inode *inode;
-	struct p9_stat_dotl *st;
 	struct v9fs_session_info *v9ses = sb->s_fs_info;
+	int (*test)(struct inode *inode, void *data);
 
-	inode = iget_locked(sb, QID2INO(&fid->qid));
-	if (unlikely(!inode))
-		return ERR_PTR(-ENOMEM);
-	if (!(inode->i_state & I_NEW)) {
-		if (!new) {
-			goto done;
-		} else { /* deal with race condition in inode number reuse */
-			p9_debug(P9_DEBUG_ERROR, "WARNING: Inode collision %lx\n",
-						inode->i_ino);
-			iput(inode);
-			remove_inode_hash(inode);
-			inode = iget_locked(sb, QID2INO(&fid->qid));
-			WARN_ON(!(inode->i_state & I_NEW));
-		}
-	}
+	if (new)
+		test = v9fs_test_new_inode_dotl;
+	else
+		test = v9fs_test_inode_dotl;
 
+	inode = iget5_locked(sb, QID2INO(qid), test, v9fs_set_inode_dotl, st);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode_state_read_once(inode) & I_NEW))
+		return inode;
 	/*
 	 * initialize the inode with the stat info
 	 * FIXME!! we may need support for stale inodes
 	 * later.
 	 */
-	st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN);
-	if (IS_ERR(st)) {
-		retval = PTR_ERR(st);
-		goto error;
-	}
-
-	retval = v9fs_init_inode(v9ses, inode, &fid->qid,
+	inode->i_ino = QID2INO(qid);
+	retval = v9fs_init_inode(v9ses, inode,
 				 st->st_mode, new_decode_dev(st->st_rdev));
-	v9fs_stat2inode_dotl(st, inode, 0);
-	kfree(st);
 	if (retval)
 		goto error;
 
+	v9fs_stat2inode_dotl(st, inode, 0);
 	v9fs_set_netfs_context(inode);
 	v9fs_cache_inode_get_cookie(inode);
 	retval = v9fs_get_acl(inode, fid);
@@ -103,11 +133,27 @@ v9fs_fid_iget_dotl(struct super_block *sb, struct p9_fid *fid, bool new)
 		goto error;
 
 	unlock_new_inode(inode);
-done:
 	return inode;
 error:
 	iget_failed(inode);
 	return ERR_PTR(retval);
+
+}
+
+/* Getattr @fid (basic + gen) and pass the reply to v9fs_qid_iget_dotl(). */
+struct inode *
+v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+			 struct super_block *sb, int new)
+{
+	struct inode *inode;
+	struct p9_stat_dotl *st;
+
+	st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN);
+	if (IS_ERR(st))
+		return ERR_CAST(st);
+	inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st, new);
+	kfree(st);
+	return inode;
+}
 
 struct dotl_openflag_map {
@@ -192,20 +238,16 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 	struct p9_fid *dfid = NULL, *ofid = NULL;
 	struct v9fs_session_info *v9ses;
 	struct posix_acl *pacl = NULL, *dacl = NULL;
-	struct dentry *res = NULL;
 
 	if (d_in_lookup(dentry)) {
-		res = v9fs_vfs_lookup(dir, dentry, 0);
-		if (IS_ERR(res))
-			return PTR_ERR(res);
-
-		if (res)
-			dentry = res;
+		struct dentry *res = v9fs_vfs_lookup(dir, dentry, 0);
+		if (res || d_really_is_positive(dentry))
+			return	finish_no_open(file, res);
 	}
 
 	/* Only creates */
-	if (!(flags & O_CREAT) || d_really_is_positive(dentry))
-		return	finish_no_open(file, res);
+	if (!(flags & O_CREAT))
+		return	finish_no_open(file, NULL);
 
 	v9ses = v9fs_inode2v9ses(dir);
 
@@ -240,7 +282,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 	}
 
 	if ((v9ses->cache & CACHE_WRITEBACK) && (p9_omode & P9_OWRITE)) {
-		p9_omode = (p9_omode & ~P9_OWRITE) | P9_ORDWR;
+		p9_omode = (p9_omode & ~(P9_OWRITE | P9_DOTL_APPEND)) | P9_ORDWR;
 		p9_debug(P9_DEBUG_CACHE,
 			"write-only file with writeback enabled, creating w/ O_RDWR\n");
 	}
@@ -259,7 +301,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 		p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
 		goto out;
 	}
-	inode = v9fs_fid_iget_dotl(dir->i_sb, fid, true);
+	inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err);
@@ -291,7 +333,6 @@ out:
 	p9_fid_put(ofid);
 	p9_fid_put(fid);
 	v9fs_put_acl(dacl, pacl);
-	dput(res);
 	return err;
 }
 
@@ -304,11 +345,12 @@ out:
  *
  */
 
-static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
-			       struct inode *dir, struct dentry *dentry,
-			       umode_t omode)
+static struct dentry *v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
+					  struct inode *dir, struct dentry *dentry,
+					  umode_t omode)
 {
 	int err;
+	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid = NULL, *dfid = NULL;
 	kgid_t gid;
 	const unsigned char *name;
@@ -318,6 +360,7 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
 	struct posix_acl *dacl = NULL, *pacl = NULL;
 
 	p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry);
+	v9ses = v9fs_inode2v9ses(dir);
 
 	omode |= S_IFDIR;
 	if (dir->i_mode & S_ISGID)
@@ -352,15 +395,15 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
 	}
 
 	/* instantiate inode and assign the unopened fid to the dentry */
-	inode = v9fs_fid_iget_dotl(dir->i_sb, fid, true);
+	inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
 			 err);
 		goto error;
 	}
-	v9fs_fid_add(dentry, &fid);
 	v9fs_set_create_acl(inode, fid, dacl, pacl);
+	v9fs_fid_add(dentry, &fid);
 	d_instantiate(dentry, inode);
 	err = 0;
 	inc_nlink(dir);
@@ -369,7 +412,7 @@ error:
 	p9_fid_put(fid);
 	v9fs_put_acl(dacl, pacl);
 	p9_fid_put(dfid);
-	return err;
+	return ERR_PTR(err);
 }
 
 static int
@@ -749,6 +792,7 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir,
 	kgid_t gid;
 	const unsigned char *name;
 	umode_t mode;
+	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid = NULL, *dfid = NULL;
 	struct inode *inode;
 	struct p9_qid qid;
@@ -758,6 +802,7 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir,
 		 dir->i_ino, dentry, omode,
 		 MAJOR(rdev), MINOR(rdev));
 
+	v9ses = v9fs_inode2v9ses(dir);
 	dfid = v9fs_parent_fid(dentry);
 	if (IS_ERR(dfid)) {
 		err = PTR_ERR(dfid);
@@ -788,7 +833,7 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir,
 			 err);
 		goto error;
 	}
-	inode = v9fs_fid_iget_dotl(dir->i_sb, fid, true);
+	inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index f52fdf42945c..315336de6f02 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -19,6 +19,7 @@
 #include <linux/statfs.h>
 #include <linux/magic.h>
 #include <linux/fscache.h>
+#include <linux/fs_context.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -30,32 +31,10 @@
 
 static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
 
-/**
- * v9fs_set_super - set the superblock
- * @s: super block
- * @data: file system specific data
- *
- */
-
-static int v9fs_set_super(struct super_block *s, void *data)
-{
-	s->s_fs_info = data;
-	return set_anon_super(s, data);
-}
-
-/**
- * v9fs_fill_super - populate superblock with info
- * @sb: superblock
- * @v9ses: session information
- * @flags: flags propagated from v9fs_mount()
- *
- */
-
-static int
-v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
-		int flags)
+static int v9fs_fill_super(struct super_block *sb)
 {
 	int ret;
+	struct v9fs_session_info *v9ses = sb->s_fs_info;
 
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
@@ -95,16 +74,12 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 }
 
 /**
- * v9fs_mount - mount a superblock
- * @fs_type: file system type
- * @flags: mount flags
- * @dev_name: device name that was mounted
- * @data: mount options
+ * v9fs_get_tree - create the mountable root and superblock
+ * @fc: the filesystem context
  *
  */
 
-static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
-		       const char *dev_name, void *data)
+static int v9fs_get_tree(struct fs_context *fc)
 {
 	struct super_block *sb = NULL;
 	struct inode *inode = NULL;
@@ -117,29 +92,32 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 
 	v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
 	if (!v9ses)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 
-	fid = v9fs_session_init(v9ses, dev_name, data);
+	fid = v9fs_session_init(v9ses, fc);
 	if (IS_ERR(fid)) {
 		retval = PTR_ERR(fid);
 		goto free_session;
 	}
 
-	sb = sget(fs_type, NULL, v9fs_set_super, flags, v9ses);
+	fc->s_fs_info = v9ses;
+	sb = sget_fc(fc, NULL, set_anon_super_fc);
 	if (IS_ERR(sb)) {
 		retval = PTR_ERR(sb);
 		goto clunk_fid;
 	}
-	retval = v9fs_fill_super(sb, v9ses, flags);
+	retval = v9fs_fill_super(sb);
 	if (retval)
 		goto release_sb;
 
-	if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
-		sb->s_d_op = &v9fs_cached_dentry_operations;
-	else
-		sb->s_d_op = &v9fs_dentry_operations;
+	if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
+		set_default_d_op(sb, &v9fs_cached_dentry_operations);
+	} else {
+		set_default_d_op(sb, &v9fs_dentry_operations);
+		sb->s_d_flags |= DCACHE_DONTCACHE;
+	}
 
-	inode = v9fs_get_inode_from_fid(v9ses, fid, sb, true);
+	inode = v9fs_get_new_inode_from_fid(v9ses, fid, sb);
 	if (IS_ERR(inode)) {
 		retval = PTR_ERR(inode);
 		goto release_sb;
@@ -157,14 +135,15 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 	v9fs_fid_add(root, &fid);
 
 	p9_debug(P9_DEBUG_VFS, " simple set mount, return 0\n");
-	return dget(sb->s_root);
+	fc->root = dget(sb->s_root);
+	return 0;
 
 clunk_fid:
 	p9_fid_put(fid);
 	v9fs_session_close(v9ses);
 free_session:
 	kfree(v9ses);
-	return ERR_PTR(retval);
+	return retval;
 
 release_sb:
 	/*
@@ -175,7 +154,7 @@ release_sb:
 	 */
 	p9_fid_put(fid);
 	deactivate_locked_super(sb);
-	return ERR_PTR(retval);
+	return retval;
 }
 
 /**
@@ -250,7 +229,7 @@ static int v9fs_drop_inode(struct inode *inode)
 
 	v9ses = v9fs_inode2v9ses(inode);
 	if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
-		return generic_drop_inode(inode);
+		return inode_generic_drop(inode);
 	/*
 	 * in case of non cached mode always drop the
 	 * inode because we want the inode attribute
@@ -301,11 +280,86 @@ static const struct super_operations v9fs_super_ops_dotl = {
 	.write_inode = v9fs_write_inode_dotl,
 };
 
+static void v9fs_free_fc(struct fs_context *fc)
+{
+	struct v9fs_context *ctx = fc->fs_private;
+
+	if (!ctx)
+		return;
+
+	/* These should be NULL by now but guard against leaks */
+	kfree(ctx->session_opts.uname);
+	kfree(ctx->session_opts.aname);
+#ifdef CONFIG_9P_FSCACHE
+	kfree(ctx->session_opts.cachetag);
+#endif
+	if (ctx->client_opts.trans_mod)
+		v9fs_put_trans(ctx->client_opts.trans_mod);
+	kfree(ctx);
+}
+
+static const struct fs_context_operations v9fs_context_ops = {
+	.parse_param	= v9fs_parse_param,
+	.get_tree	= v9fs_get_tree,
+	.free		= v9fs_free_fc,
+};
+
+static int v9fs_init_fs_context(struct fs_context *fc)
+{
+	struct v9fs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+
+	if (!ctx)
+		return -ENOMEM;
+
+	/* attach early: the error path needs ->free() to find and release ctx */
+	fc->ops = &v9fs_context_ops;
+	fc->fs_private = ctx;
+
+	/* initialize core options */
+	ctx->session_opts.afid = ~0;
+	ctx->session_opts.cache = CACHE_NONE;
+	ctx->session_opts.session_lock_timeout = P9_LOCK_TIMEOUT;
+	ctx->session_opts.uname = kstrdup(V9FS_DEFUSER, GFP_KERNEL);
+	if (!ctx->session_opts.uname)
+		goto error;
+
+	ctx->session_opts.aname = kstrdup(V9FS_DEFANAME, GFP_KERNEL);
+	if (!ctx->session_opts.aname)
+		goto error;
+
+	ctx->session_opts.uid = INVALID_UID;
+	ctx->session_opts.dfltuid = V9FS_DEFUID;
+	ctx->session_opts.dfltgid = V9FS_DEFGID;
+
+	/* initialize client options */
+	ctx->client_opts.proto_version = p9_proto_2000L;
+	ctx->client_opts.msize = DEFAULT_MSIZE;
+
+	/* initialize fd transport options */
+	ctx->fd_opts.port = P9_FD_PORT;
+	ctx->fd_opts.rfd = ~0;
+	ctx->fd_opts.wfd = ~0;
+	ctx->fd_opts.privport = false;
+
+	/* initialize rdma transport options */
+	ctx->rdma_opts.port = P9_RDMA_PORT;
+	ctx->rdma_opts.sq_depth = P9_RDMA_SQ_DEPTH;
+	ctx->rdma_opts.rq_depth = P9_RDMA_RQ_DEPTH;
+	ctx->rdma_opts.timeout = P9_RDMA_TIMEOUT;
+	ctx->rdma_opts.privport = false;
+
+	return 0;
+error:
+	fc->need_free = 1;
+	return -ENOMEM;
+}
+
 struct file_system_type v9fs_fs_type = {
 	.name = "9p",
-	.mount = v9fs_mount,
 	.kill_sb = v9fs_kill_super,
 	.owner = THIS_MODULE,
 	.fs_flags = FS_RENAME_DOES_D_MOVE,
+	.init_fs_context = v9fs_init_fs_context,
+	.parameters = v9fs_param_spec,
 };
 MODULE_ALIAS_FS("9p");
diff --git a/fs/Kconfig b/fs/Kconfig
index a46b0cbc4d8f..0bfdaecaa877 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -43,7 +43,6 @@ config FS_MBCACHE
 	default y if EXT4_FS=y
 	default m if EXT2_FS_XATTR || EXT4_FS
 
-source "fs/reiserfs/Kconfig"
 source "fs/jfs/Kconfig"
 
 source "fs/xfs/Kconfig"
@@ -52,7 +51,6 @@ source "fs/ocfs2/Kconfig"
 source "fs/btrfs/Kconfig"
 source "fs/nilfs2/Kconfig"
 source "fs/f2fs/Kconfig"
-source "fs/bcachefs/Kconfig"
 source "fs/zonefs/Kconfig"
 
 endif # BLOCK
@@ -60,7 +58,7 @@ endif # BLOCK
 config FS_DAX
 	bool "File system based Direct Access (DAX) support"
 	depends on MMU
-	depends on ZONE_DEVICE || FS_DAX_LIMITED
+	depends on ZONE_DEVICE
 	select FS_IOMAP
 	select DAX
 	help
@@ -96,13 +94,6 @@ config FS_DAX_PMD
 	depends on ZONE_DEVICE
 	depends on TRANSPARENT_HUGEPAGE
 
-# Selected by DAX drivers that do not expect filesystem DAX to support
-# get_user_pages() of DAX mappings. I.e. "limited" indicates no support
-# for fork() of processes with MAP_SHARED mappings or support for
-# direct-I/O to a DAX mapping.
-config FS_DAX_LIMITED
-	bool
-
 # Posix ACL utility routines
 #
 # Note: Posix ACLs can be implemented without these helpers.  Never use
@@ -257,8 +248,7 @@ config ARCH_SUPPORTS_HUGETLBFS
 
 menuconfig HUGETLBFS
 	bool "HugeTLB file system support"
-	depends on X86 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
-	depends on (SYSFS || SYSCTL)
+	depends on ARCH_SUPPORTS_HUGETLBFS
 	select MEMFD_CREATE
 	select PADATA if SMP
 	help
@@ -287,6 +277,11 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	def_bool HUGETLB_PAGE
 	depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
 	depends on SPARSEMEM_VMEMMAP
+	select SPARSEMEM_VMEMMAP_PREINIT if ARCH_WANT_HUGETLB_VMEMMAP_PREINIT
+
+config HUGETLB_PMD_PAGE_TABLE_SHARING
+	def_bool HUGETLB_PAGE
+	depends on ARCH_WANT_HUGE_PMD_SHARE && SPLIT_PMD_PTLOCKS
 
 config ARCH_HAS_GIGANTIC_PAGE
 	bool
@@ -331,9 +326,9 @@ source "fs/omfs/Kconfig"
 source "fs/hpfs/Kconfig"
 source "fs/qnx4/Kconfig"
 source "fs/qnx6/Kconfig"
+source "fs/resctrl/Kconfig"
 source "fs/romfs/Kconfig"
 source "fs/pstore/Kconfig"
-source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/erofs/Kconfig"
 source "fs/vboxsf/Kconfig"
@@ -365,6 +360,7 @@ config GRACE_PERIOD
 config LOCKD
 	tristate
 	depends on FILE_LOCKING
+	select CRC32
 	select GRACE_PERIOD
 
 config LOCKD_V4
@@ -382,6 +378,29 @@ config NFS_COMMON
 	depends on NFSD || NFS_FS || LOCKD
 	default y
 
+config NFS_COMMON_LOCALIO_SUPPORT
+	tristate
+	depends on NFS_LOCALIO
+	default y if NFSD=y || NFS_FS=y
+	default m if NFSD=m && NFS_FS=m
+	select SUNRPC
+
+config NFS_LOCALIO
+	bool "NFS client and server support for LOCALIO auxiliary protocol"
+	depends on NFSD && NFS_FS
+	select NFS_COMMON_LOCALIO_SUPPORT
+	default n
+	help
+	  Some NFS servers support an auxiliary NFS LOCALIO protocol
+	  that is not an official part of the NFS protocol.
+
+	  This option enables support for the LOCALIO protocol in the
+	  kernel's NFS server and client. Enable this to permit local
+	  NFS clients to bypass the network when issuing reads and
+	  writes to the local NFS server.
+
+	  If unsure, say N.
+
 config NFS_V4_2_SSC_HELPER
 	bool
 	default y if NFS_V4_2
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index f5693164ca9a..1949e25c7741 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -176,4 +176,21 @@ config COREDUMP
 	  certainly want to say Y here. Not necessary on systems that never
 	  need debugging or only ever run flawless code.
 
+config EXEC_KUNIT_TEST
+	bool "Build execve tests" if !KUNIT_ALL_TESTS
+	depends on KUNIT=y
+	default KUNIT_ALL_TESTS
+	help
+	  This builds the exec KUnit tests, which test boundary conditions
+	  of various aspects of the exec internals.
+
+config ARCH_HAS_ELF_CORE_EFLAGS
+	bool
+	depends on BINFMT_ELF && ELF_CORE
+	default n
+	help
+	  Select this option if the architecture makes use of the e_flags
+	  field in the ELF header to store ABI or other architecture-specific
+	  information that should be preserved in core dumps.
+
 endmenu
diff --git a/fs/Makefile b/fs/Makefile
index 6ecc9b0a53f2..a04274a3c854 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -14,8 +14,9 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o splice.o sync.o utimes.o d_path.o \
 		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
-		fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
-		kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o
+		fs_dirent.o fs_context.o fs_parser.o fsopen.o init.o \
+		kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \
+		file_attr.o
 
 obj-$(CONFIG_BUFFER_HEAD)	+= buffer.o mpage.o
 obj-$(CONFIG_PROC_FS)		+= proc_namespace.o
@@ -61,7 +62,6 @@ obj-$(CONFIG_DLM)		+= dlm/
  
 # Do not add any filesystems before this line
 obj-$(CONFIG_NETFS_SUPPORT)	+= netfs/
-obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
 obj-$(CONFIG_EXT4_FS)		+= ext4/
 # We place ext4 before ext2 so that clean ext3 root fs's do NOT mount using the
 # ext2 driver, which doesn't know about journalling!  Explicitly request ext2
@@ -88,7 +88,6 @@ obj-$(CONFIG_NFSD)		+= nfsd/
 obj-$(CONFIG_LOCKD)		+= lockd/
 obj-$(CONFIG_NLS)		+= nls/
 obj-y				+= unicode/
-obj-$(CONFIG_SYSV_FS)		+= sysv/
 obj-$(CONFIG_SMBFS)		+= smb/
 obj-$(CONFIG_HPFS_FS)		+= hpfs/
 obj-$(CONFIG_NTFS3_FS)		+= ntfs3/
@@ -122,10 +121,11 @@ obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
 obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)           += gfs2/
 obj-$(CONFIG_F2FS_FS)		+= f2fs/
-obj-$(CONFIG_BCACHEFS_FS)	+= bcachefs/
 obj-$(CONFIG_CEPH_FS)		+= ceph/
 obj-$(CONFIG_PSTORE)		+= pstore/
 obj-$(CONFIG_EFIVAR_FS)		+= efivarfs/
 obj-$(CONFIG_EROFS_FS)		+= erofs/
 obj-$(CONFIG_VBOXSF_FS)		+= vboxsf/
 obj-$(CONFIG_ZONEFS_FS)		+= zonefs/
+obj-$(CONFIG_BPF_LSM)		+= bpf_fs_kfuncs.o
+obj-$(CONFIG_RESCTRL_FS)	+= resctrl/
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index ee80718aaeec..cd13165fd904 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -25,7 +25,7 @@
 const struct file_operations adfs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read_iter	= generic_file_read_iter,
-	.mmap		= generic_file_mmap,
+	.mmap_prepare	= generic_file_mmap_prepare,
 	.fsync		= generic_file_fsync,
 	.write_iter	= generic_file_write_iter,
 	.splice_read	= filemap_splice_read,
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index a183e213a4a5..6830f8bc8d4e 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -53,14 +53,14 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to)
 		truncate_pagecache(inode, inode->i_size);
 }
 
-static int adfs_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len,
-			struct page **pagep, void **fsdata)
+static int adfs_write_begin(const struct kiocb *iocb,
+			    struct address_space *mapping,
+			    loff_t pos, unsigned len,
+			    struct folio **foliop, void **fsdata)
 {
 	int ret;
 
-	*pagep = NULL;
-	ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata,
+	ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata,
 				adfs_get_block,
 				&ADFS_I(mapping->host)->mmu_private);
 	if (unlikely(ret))
diff --git a/fs/adfs/map.c b/fs/adfs/map.c
index a81de80c45c1..a0ce272b4098 100644
--- a/fs/adfs/map.c
+++ b/fs/adfs/map.c
@@ -6,7 +6,7 @@
  */
 #include <linux/slab.h>
 #include <linux/statfs.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
 #include "adfs.h"
 
 /*
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 9354b14bbfe3..fdccdbbfc213 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -6,7 +6,8 @@
  */
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
+#include <linux/fs_context.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
@@ -115,87 +116,61 @@ static int adfs_show_options(struct seq_file *seq, struct dentry *root)
 	return 0;
 }
 
-enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_ftsuffix, Opt_err};
+enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_ftsuffix};
 
-static const match_table_t tokens = {
-	{Opt_uid, "uid=%u"},
-	{Opt_gid, "gid=%u"},
-	{Opt_ownmask, "ownmask=%o"},
-	{Opt_othmask, "othmask=%o"},
-	{Opt_ftsuffix, "ftsuffix=%u"},
-	{Opt_err, NULL}
+static const struct fs_parameter_spec adfs_param_spec[] = {
+	fsparam_uid	("uid",		Opt_uid),
+	fsparam_gid	("gid",		Opt_gid),
+	fsparam_u32oct	("ownmask",	Opt_ownmask),
+	fsparam_u32oct	("othmask",	Opt_othmask),
+	fsparam_u32	("ftsuffix",	Opt_ftsuffix),
+	{}
 };
 
-static int parse_options(struct super_block *sb, struct adfs_sb_info *asb,
-			 char *options)
+static int adfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
-	char *p;
-	int option;
-
-	if (!options)
-		return 0;
-
-	while ((p = strsep(&options, ",")) != NULL) {
-		substring_t args[MAX_OPT_ARGS];
-		int token;
-		if (!*p)
-			continue;
-
-		token = match_token(p, tokens, args);
-		switch (token) {
-		case Opt_uid:
-			if (match_int(args, &option))
-				return -EINVAL;
-			asb->s_uid = make_kuid(current_user_ns(), option);
-			if (!uid_valid(asb->s_uid))
-				return -EINVAL;
-			break;
-		case Opt_gid:
-			if (match_int(args, &option))
-				return -EINVAL;
-			asb->s_gid = make_kgid(current_user_ns(), option);
-			if (!gid_valid(asb->s_gid))
-				return -EINVAL;
-			break;
-		case Opt_ownmask:
-			if (match_octal(args, &option))
-				return -EINVAL;
-			asb->s_owner_mask = option;
-			break;
-		case Opt_othmask:
-			if (match_octal(args, &option))
-				return -EINVAL;
-			asb->s_other_mask = option;
-			break;
-		case Opt_ftsuffix:
-			if (match_int(args, &option))
-				return -EINVAL;
-			asb->s_ftsuffix = option;
-			break;
-		default:
-			adfs_msg(sb, KERN_ERR,
-				 "unrecognised mount option \"%s\" or missing value",
-				 p);
-			return -EINVAL;
-		}
+	struct adfs_sb_info *asb = fc->s_fs_info;
+	struct fs_parse_result result;
+	int opt;
+
+	opt = fs_parse(fc, adfs_param_spec, param, &result);
+	if (opt < 0)
+		return opt;
+
+	switch (opt) {
+	case Opt_uid:
+		asb->s_uid = result.uid;
+		break;
+	case Opt_gid:
+		asb->s_gid = result.gid;
+		break;
+	case Opt_ownmask:
+		asb->s_owner_mask = result.uint_32;
+		break;
+	case Opt_othmask:
+		asb->s_other_mask = result.uint_32;
+		break;
+	case Opt_ftsuffix:
+		asb->s_ftsuffix = result.uint_32;
+		break;
+	default:
+		return -EINVAL;
 	}
 	return 0;
 }
 
-static int adfs_remount(struct super_block *sb, int *flags, char *data)
+static int adfs_reconfigure(struct fs_context *fc)
 {
-	struct adfs_sb_info temp_asb;
-	int ret;
+	struct adfs_sb_info *new_asb = fc->s_fs_info;
+	struct adfs_sb_info *asb = ADFS_SB(fc->root->d_sb);
 
-	sync_filesystem(sb);
-	*flags |= ADFS_SB_FLAGS;
+	sync_filesystem(fc->root->d_sb);
+	fc->sb_flags |= ADFS_SB_FLAGS;
 
-	temp_asb = *ADFS_SB(sb);
-	ret = parse_options(sb, &temp_asb, data);
-	if (ret == 0)
-		*ADFS_SB(sb) = temp_asb;
+	/* Structure copy newly parsed options */
+	*asb = *new_asb;
 
-	return ret;
+	return 0;
 }
 
 static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -273,7 +248,6 @@ static const struct super_operations adfs_sops = {
 	.write_inode	= adfs_write_inode,
 	.put_super	= adfs_put_super,
 	.statfs		= adfs_statfs,
-	.remount_fs	= adfs_remount,
 	.show_options	= adfs_show_options,
 };
 
@@ -361,34 +335,21 @@ static int adfs_validate_dr0(struct super_block *sb, struct buffer_head *bh,
 	return 0;
 }
 
-static int adfs_fill_super(struct super_block *sb, void *data, int silent)
+static int adfs_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	struct adfs_discrecord *dr;
 	struct object_info root_obj;
-	struct adfs_sb_info *asb;
+	struct adfs_sb_info *asb = sb->s_fs_info;
 	struct inode *root;
 	int ret = -EINVAL;
+	int silent = fc->sb_flags & SB_SILENT;
 
 	sb->s_flags |= ADFS_SB_FLAGS;
 
-	asb = kzalloc(sizeof(*asb), GFP_KERNEL);
-	if (!asb)
-		return -ENOMEM;
-
 	sb->s_fs_info = asb;
 	sb->s_magic = ADFS_SUPER_MAGIC;
 	sb->s_time_gran = 10000000;
 
-	/* set default options */
-	asb->s_uid = GLOBAL_ROOT_UID;
-	asb->s_gid = GLOBAL_ROOT_GID;
-	asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK;
-	asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK;
-	asb->s_ftsuffix = 0;
-
-	if (parse_options(sb, asb, data))
-		goto error;
-
 	/* Try to probe the filesystem boot block */
 	ret = adfs_probe(sb, ADFS_DISCRECORD, 1, adfs_validate_bblk);
 	if (ret == -EILSEQ)
@@ -436,7 +397,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
 	if (asb->s_ftsuffix)
 		asb->s_namelen += 4;
 
-	sb->s_d_op = &adfs_dentry_operations;
+	set_default_d_op(sb, &adfs_dentry_operations);
 	root = adfs_iget(sb, &root_obj);
 	sb->s_root = d_make_root(root);
 	if (!sb->s_root) {
@@ -453,18 +414,61 @@ error:
 	return ret;
 }
 
-static struct dentry *adfs_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int adfs_get_tree(struct fs_context *fc)
+{
+	return get_tree_bdev(fc, adfs_fill_super);
+}
+
+static void adfs_free_fc(struct fs_context *fc)
 {
-	return mount_bdev(fs_type, flags, dev_name, data, adfs_fill_super);
+	struct adfs_context *asb = fc->s_fs_info;
+
+	kfree(asb);
+}
+
+static const struct fs_context_operations adfs_context_ops = {
+	.parse_param	= adfs_parse_param,
+	.get_tree	= adfs_get_tree,
+	.reconfigure	= adfs_reconfigure,
+	.free		= adfs_free_fc,
+};
+
+static int adfs_init_fs_context(struct fs_context *fc)
+{
+	struct adfs_sb_info *asb;
+
+	asb = kzalloc(sizeof(struct adfs_sb_info), GFP_KERNEL);
+	if (!asb)
+		return -ENOMEM;
+
+	if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+		struct super_block *sb = fc->root->d_sb;
+		struct adfs_sb_info *old_asb = ADFS_SB(sb);
+
+		/* structure copy existing options before parsing */
+		*asb = *old_asb;
+	} else {
+		/* set default options */
+		asb->s_uid = GLOBAL_ROOT_UID;
+		asb->s_gid = GLOBAL_ROOT_GID;
+		asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK;
+		asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK;
+		asb->s_ftsuffix = 0;
+	}
+
+	fc->ops = &adfs_context_ops;
+	fc->s_fs_info = asb;
+
+	return 0;
 }
 
 static struct file_system_type adfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "adfs",
-	.mount		= adfs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
+	.init_fs_context = adfs_init_fs_context,
+	.parameters	= adfs_param_spec,
 };
 MODULE_ALIAS_FS("adfs");
 
@@ -491,4 +495,5 @@ static void __exit exit_adfs_fs(void)
 
 module_init(init_adfs_fs)
 module_exit(exit_adfs_fs)
+MODULE_DESCRIPTION("Acorn Disc Filing System");
 MODULE_LICENSE("GPL");
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 2e612834329a..ac4e9a02910b 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -14,8 +14,6 @@
 
 /* Ugly macros make the code more pretty. */
 
-#define GET_END_PTR(st,p,sz)		 ((st *)((char *)(p)+((sz)-sizeof(st))))
-#define AFFS_GET_HASHENTRY(data,hashkey) be32_to_cpu(((struct dir_front *)data)->hashtable[hashkey])
 #define AFFS_BLOCK(sb, bh, blk)		(AFFS_HEAD(bh)->table[AFFS_SB(sb)->s_hashsize-1-(blk)])
 
 #define AFFS_HEAD(bh)		((struct affs_head *)(bh)->b_data)
@@ -170,7 +168,7 @@ extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, unsi
 extern int	affs_unlink(struct inode *dir, struct dentry *dentry);
 extern int	affs_create(struct mnt_idmap *idmap, struct inode *dir,
 			struct dentry *dentry, umode_t mode, bool);
-extern int	affs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+extern struct dentry *affs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 			struct dentry *dentry, umode_t mode);
 extern int	affs_rmdir(struct inode *dir, struct dentry *dentry);
 extern int	affs_link(struct dentry *olddentry, struct inode *dir,
diff --git a/fs/affs/amigaffs.h b/fs/affs/amigaffs.h
index 81fb396d4dfa..da3217ab6adb 100644
--- a/fs/affs/amigaffs.h
+++ b/fs/affs/amigaffs.h
@@ -49,12 +49,13 @@ struct affs_short_date {
 
 struct affs_root_head {
 	__be32 ptype;
+	/* The following fields are not used, but kept as documentation. */
 	__be32 spare1;
 	__be32 spare2;
 	__be32 hash_size;
 	__be32 spare3;
 	__be32 checksum;
-	__be32 hashtable[1];
+	__be32 hashtable[];
 };
 
 struct affs_root_tail {
@@ -80,7 +81,7 @@ struct affs_head {
 	__be32 spare1;
 	__be32 first_data;
 	__be32 checksum;
-	__be32 table[1];
+	__be32 table[];
 };
 
 struct affs_tail {
@@ -108,7 +109,7 @@ struct slink_front
 	__be32 key;
 	__be32 spare1[3];
 	__be32 checksum;
-	u8 symname[1];	/* depends on block size */
+	u8 symname[];	/* depends on block size */
 };
 
 struct affs_data_head
@@ -119,7 +120,7 @@ struct affs_data_head
 	__be32 size;
 	__be32 next;
 	__be32 checksum;
-	u8 data[1];	/* depends on block size */
+	u8 data[];	/* depends on block size */
 };
 
 /* Permission bits */
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index b2bf7016e1b3..bd40d5f08810 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -17,13 +17,44 @@
 #include <linux/iversion.h>
 #include "affs.h"
 
+struct affs_dir_data {
+	unsigned long ino;
+	u64 cookie;
+};
+
 static int affs_readdir(struct file *, struct dir_context *);
 
+static loff_t affs_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+	struct affs_dir_data *data = file->private_data;
+
+	return generic_llseek_cookie(file, offset, whence, &data->cookie);
+}
+
+static int affs_dir_open(struct inode *inode, struct file *file)
+{
+	struct affs_dir_data	*data;
+
+	data = kzalloc(sizeof(struct affs_dir_data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+	file->private_data = data;
+	return 0;
+}
+
+static int affs_dir_release(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+	return 0;
+}
+
 const struct file_operations affs_dir_operations = {
+	.open		= affs_dir_open,
 	.read		= generic_read_dir,
-	.llseek		= generic_file_llseek,
+	.llseek		= affs_dir_llseek,
 	.iterate_shared	= affs_readdir,
 	.fsync		= affs_file_fsync,
+	.release	= affs_dir_release,
 };
 
 /*
@@ -45,6 +76,7 @@ static int
 affs_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct inode		*inode = file_inode(file);
+	struct affs_dir_data	*data = file->private_data;
 	struct super_block	*sb = inode->i_sb;
 	struct buffer_head	*dir_bh = NULL;
 	struct buffer_head	*fh_bh = NULL;
@@ -59,7 +91,7 @@ affs_readdir(struct file *file, struct dir_context *ctx)
 	pr_debug("%s(ino=%lu,f_pos=%llx)\n", __func__, inode->i_ino, ctx->pos);
 
 	if (ctx->pos < 2) {
-		file->private_data = (void *)0;
+		data->ino = 0;
 		if (!dir_emit_dots(file, ctx))
 			return 0;
 	}
@@ -80,8 +112,8 @@ affs_readdir(struct file *file, struct dir_context *ctx)
 	/* If the directory hasn't changed since the last call to readdir(),
 	 * we can jump directly to where we left off.
 	 */
-	ino = (u32)(long)file->private_data;
-	if (ino && inode_eq_iversion(inode, file->f_version)) {
+	ino = data->ino;
+	if (ino && inode_eq_iversion(inode, data->cookie)) {
 		pr_debug("readdir() left off=%d\n", ino);
 		goto inside;
 	}
@@ -131,8 +163,8 @@ inside:
 		} while (ino);
 	}
 done:
-	file->f_version = inode_query_iversion(inode);
-	file->private_data = (void *)(long)ino;
+	data->cookie = inode_query_iversion(inode);
+	data->ino = ino;
 	affs_brelse(fh_bh);
 
 out_brelse_dir:
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 04c018e19602..765c3443663e 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -415,14 +415,14 @@ affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	return ret;
 }
 
-static int affs_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len,
-			struct page **pagep, void **fsdata)
+static int affs_write_begin(const struct kiocb *iocb,
+			    struct address_space *mapping,
+			    loff_t pos, unsigned len,
+			    struct folio **foliop, void **fsdata)
 {
 	int ret;
 
-	*pagep = NULL;
-	ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata,
+	ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata,
 				affs_get_block,
 				&AFFS_I(mapping->host)->mmu_private);
 	if (unlikely(ret))
@@ -431,14 +431,15 @@ static int affs_write_begin(struct file *file, struct address_space *mapping,
 	return ret;
 }
 
-static int affs_write_end(struct file *file, struct address_space *mapping,
-			  loff_t pos, unsigned int len, unsigned int copied,
-			  struct page *page, void *fsdata)
+static int affs_write_end(const struct kiocb *iocb,
+			  struct address_space *mapping, loff_t pos,
+			  unsigned int len, unsigned int copied,
+			  struct folio *folio, void *fsdata)
 {
 	struct inode *inode = mapping->host;
 	int ret;
 
-	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+	ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata);
 
 	/* Clear Archived bit on file writes, as AmigaOS would do */
 	if (AFFS_I(inode)->i_protect & FIBF_ARCHIVED) {
@@ -597,7 +598,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize)
 		BUG_ON(tmp > bsize);
 		AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA);
 		AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino);
-		AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx);
+		AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx + 1);
 		AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp);
 		affs_fix_checksum(sb, bh);
 		bh->b_state &= ~(1UL << BH_New);
@@ -646,9 +647,10 @@ static int affs_read_folio_ofs(struct file *file, struct folio *folio)
 	return err;
 }
 
-static int affs_write_begin_ofs(struct file *file, struct address_space *mapping,
+static int affs_write_begin_ofs(const struct kiocb *iocb,
+				struct address_space *mapping,
 				loff_t pos, unsigned len,
-				struct page **pagep, void **fsdata)
+				struct folio **foliop, void **fsdata)
 {
 	struct inode *inode = mapping->host;
 	struct folio *folio;
@@ -671,7 +673,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
 			mapping_gfp_mask(mapping));
 	if (IS_ERR(folio))
 		return PTR_ERR(folio);
-	*pagep = &folio->page;
+	*foliop = folio;
 
 	if (folio_test_uptodate(folio))
 		return 0;
@@ -685,11 +687,11 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
 	return err;
 }
 
-static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
-				loff_t pos, unsigned len, unsigned copied,
-				struct page *page, void *fsdata)
+static int affs_write_end_ofs(const struct kiocb *iocb,
+			      struct address_space *mapping,
+			      loff_t pos, unsigned len, unsigned copied,
+			      struct folio *folio, void *fsdata)
 {
-	struct folio *folio = page_folio(page);
 	struct inode *inode = mapping->host;
 	struct super_block *sb = inode->i_sb;
 	struct buffer_head *bh, *prev_bh;
@@ -726,7 +728,8 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
 		tmp = min(bsize - boff, to - from);
 		BUG_ON(boff + tmp > bsize || tmp > bsize);
 		memcpy(AFFS_DATA(bh) + boff, data + from, tmp);
-		be32_add_cpu(&AFFS_DATA_HEAD(bh)->size, tmp);
+		AFFS_DATA_HEAD(bh)->size = cpu_to_be32(
+			max(boff + tmp, be32_to_cpu(AFFS_DATA_HEAD(bh)->size)));
 		affs_fix_checksum(sb, bh);
 		mark_buffer_dirty_inode(bh, inode);
 		written += tmp;
@@ -748,7 +751,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
 		if (buffer_new(bh)) {
 			AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA);
 			AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino);
-			AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx);
+			AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx + 1);
 			AFFS_DATA_HEAD(bh)->size = cpu_to_be32(bsize);
 			AFFS_DATA_HEAD(bh)->next = 0;
 			bh->b_state &= ~(1UL << BH_New);
@@ -782,7 +785,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
 		if (buffer_new(bh)) {
 			AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA);
 			AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino);
-			AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx);
+			AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx + 1);
 			AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp);
 			AFFS_DATA_HEAD(bh)->next = 0;
 			bh->b_state &= ~(1UL << BH_New);
@@ -882,14 +885,14 @@ affs_truncate(struct inode *inode)
 
 	if (inode->i_size > AFFS_I(inode)->mmu_private) {
 		struct address_space *mapping = inode->i_mapping;
-		struct page *page;
+		struct folio *folio;
 		void *fsdata = NULL;
 		loff_t isize = inode->i_size;
 		int res;
 
-		res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, &page, &fsdata);
+		res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, &folio, &fsdata);
 		if (!res)
-			res = mapping->a_ops->write_end(NULL, mapping, isize, 0, 0, page, fsdata);
+			res = mapping->a_ops->write_end(NULL, mapping, isize, 0, 0, folio, fsdata);
 		else
 			inode->i_size = AFFS_I(inode)->mmu_private;
 		mark_inode_dirty(inode);
@@ -1000,7 +1003,7 @@ const struct file_operations affs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read_iter	= generic_file_read_iter,
 	.write_iter	= generic_file_write_iter,
-	.mmap		= generic_file_mmap,
+	.mmap_prepare	= generic_file_mmap_prepare,
 	.open		= affs_file_open,
 	.release	= affs_file_release,
 	.fsync		= affs_file_fsync,
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 0210df8d3500..0bfc7d151dcd 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -29,7 +29,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
 	inode = iget_locked(sb, ino);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
-	if (!(inode->i_state & I_NEW))
+	if (!(inode_state_read_once(inode) & I_NEW))
 		return inode;
 
 	pr_debug("affs_iget(%lu)\n", inode->i_ino);
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 8c154490a2d6..f883be50db12 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -273,7 +273,7 @@ affs_create(struct mnt_idmap *idmap, struct inode *dir,
 	return 0;
 }
 
-int
+struct dentry *
 affs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 	   struct dentry *dentry, umode_t mode)
 {
@@ -285,7 +285,7 @@ affs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 
 	inode = affs_new_inode(dir);
 	if (!inode)
-		return -ENOSPC;
+		return ERR_PTR(-ENOSPC);
 
 	inode->i_mode = S_IFDIR | mode;
 	affs_mode_to_prot(inode);
@@ -298,9 +298,9 @@ affs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 		clear_nlink(inode);
 		mark_inode_dirty(inode);
 		iput(inode);
-		return error;
+		return ERR_PTR(error);
 	}
-	return 0;
+	return NULL;
 }
 
 int
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 3c5821339609..44f8aa883100 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -14,7 +14,8 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/statfs.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
+#include <linux/fs_context.h>
 #include <linux/magic.h>
 #include <linux/sched.h>
 #include <linux/cred.h>
@@ -27,7 +28,6 @@
 
 static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int affs_show_options(struct seq_file *m, struct dentry *root);
-static int affs_remount (struct super_block *sb, int *flags, char *data);
 
 static void
 affs_commit_super(struct super_block *sb, int wait)
@@ -155,140 +155,114 @@ static const struct super_operations affs_sops = {
 	.put_super	= affs_put_super,
 	.sync_fs	= affs_sync_fs,
 	.statfs		= affs_statfs,
-	.remount_fs	= affs_remount,
 	.show_options	= affs_show_options,
 };
 
 enum {
 	Opt_bs, Opt_mode, Opt_mufs, Opt_notruncate, Opt_prefix, Opt_protect,
 	Opt_reserved, Opt_root, Opt_setgid, Opt_setuid,
-	Opt_verbose, Opt_volume, Opt_ignore, Opt_err,
+	Opt_verbose, Opt_volume, Opt_ignore,
 };
 
-static const match_table_t tokens = {
-	{Opt_bs, "bs=%u"},
-	{Opt_mode, "mode=%o"},
-	{Opt_mufs, "mufs"},
-	{Opt_notruncate, "nofilenametruncate"},
-	{Opt_prefix, "prefix=%s"},
-	{Opt_protect, "protect"},
-	{Opt_reserved, "reserved=%u"},
-	{Opt_root, "root=%u"},
-	{Opt_setgid, "setgid=%u"},
-	{Opt_setuid, "setuid=%u"},
-	{Opt_verbose, "verbose"},
-	{Opt_volume, "volume=%s"},
-	{Opt_ignore, "grpquota"},
-	{Opt_ignore, "noquota"},
-	{Opt_ignore, "quota"},
-	{Opt_ignore, "usrquota"},
-	{Opt_err, NULL},
+struct affs_context {
+	kuid_t		uid;		/* uid to override */
+	kgid_t		gid;		/* gid to override */
+	unsigned int	mode;		/* mode to override */
+	unsigned int	reserved;	/* Number of reserved blocks */
+	int		root_block;	/* FFS root block number */
+	int		blocksize;	/* Initial device blksize */
+	char		*prefix;	/* Prefix for volumes and assigns */
+	char		volume[32];	/* Vol. prefix for absolute symlinks */
+	unsigned long	mount_flags;	/* Options */
 };
 
-static int
-parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, s32 *root,
-		int *blocksize, char **prefix, char *volume, unsigned long *mount_opts)
+static const struct fs_parameter_spec affs_param_spec[] = {
+	fsparam_u32	("bs",		Opt_bs),
+	fsparam_u32oct	("mode",	Opt_mode),
+	fsparam_flag	("mufs",	Opt_mufs),
+	fsparam_flag	("nofilenametruncate",	Opt_notruncate),
+	fsparam_string	("prefix",	Opt_prefix),
+	fsparam_flag	("protect",	Opt_protect),
+	fsparam_u32	("reserved",	Opt_reserved),
+	fsparam_u32	("root",	Opt_root),
+	fsparam_gid	("setgid",	Opt_setgid),
+	fsparam_uid	("setuid",	Opt_setuid),
+	fsparam_flag	("verbose",	Opt_verbose),
+	fsparam_string	("volume",	Opt_volume),
+	fsparam_flag	("grpquota",	Opt_ignore),
+	fsparam_flag	("noquota",	Opt_ignore),
+	fsparam_flag	("quota",	Opt_ignore),
+	fsparam_flag	("usrquota",	Opt_ignore),
+	{},
+};
+
+static int affs_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
-	char *p;
-	substring_t args[MAX_OPT_ARGS];
-
-	/* Fill in defaults */
-
-	*uid        = current_uid();
-	*gid        = current_gid();
-	*reserved   = 2;
-	*root       = -1;
-	*blocksize  = -1;
-	volume[0]   = ':';
-	volume[1]   = 0;
-	*mount_opts = 0;
-	if (!options)
-		return 1;
-
-	while ((p = strsep(&options, ",")) != NULL) {
-		int token, n, option;
-		if (!*p)
-			continue;
-
-		token = match_token(p, tokens, args);
-		switch (token) {
-		case Opt_bs:
-			if (match_int(&args[0], &n))
-				return 0;
-			if (n != 512 && n != 1024 && n != 2048
-			    && n != 4096) {
-				pr_warn("Invalid blocksize (512, 1024, 2048, 4096 allowed)\n");
-				return 0;
-			}
-			*blocksize = n;
-			break;
-		case Opt_mode:
-			if (match_octal(&args[0], &option))
-				return 0;
-			*mode = option & 0777;
-			affs_set_opt(*mount_opts, SF_SETMODE);
-			break;
-		case Opt_mufs:
-			affs_set_opt(*mount_opts, SF_MUFS);
-			break;
-		case Opt_notruncate:
-			affs_set_opt(*mount_opts, SF_NO_TRUNCATE);
-			break;
-		case Opt_prefix:
-			kfree(*prefix);
-			*prefix = match_strdup(&args[0]);
-			if (!*prefix)
-				return 0;
-			affs_set_opt(*mount_opts, SF_PREFIX);
-			break;
-		case Opt_protect:
-			affs_set_opt(*mount_opts, SF_IMMUTABLE);
-			break;
-		case Opt_reserved:
-			if (match_int(&args[0], reserved))
-				return 0;
-			break;
-		case Opt_root:
-			if (match_int(&args[0], root))
-				return 0;
-			break;
-		case Opt_setgid:
-			if (match_int(&args[0], &option))
-				return 0;
-			*gid = make_kgid(current_user_ns(), option);
-			if (!gid_valid(*gid))
-				return 0;
-			affs_set_opt(*mount_opts, SF_SETGID);
-			break;
-		case Opt_setuid:
-			if (match_int(&args[0], &option))
-				return 0;
-			*uid = make_kuid(current_user_ns(), option);
-			if (!uid_valid(*uid))
-				return 0;
-			affs_set_opt(*mount_opts, SF_SETUID);
-			break;
-		case Opt_verbose:
-			affs_set_opt(*mount_opts, SF_VERBOSE);
-			break;
-		case Opt_volume: {
-			char *vol = match_strdup(&args[0]);
-			if (!vol)
-				return 0;
-			strscpy(volume, vol, 32);
-			kfree(vol);
-			break;
-		}
-		case Opt_ignore:
-		 	/* Silently ignore the quota options */
-			break;
-		default:
-			pr_warn("Unrecognized mount option \"%s\" or missing value\n",
-				p);
-			return 0;
+	struct affs_context *ctx = fc->fs_private;
+	struct fs_parse_result result;
+	int n;
+	int opt;
+
+	opt = fs_parse(fc, affs_param_spec, param, &result);
+	if (opt < 0)
+		return opt;
+
+	switch (opt) {
+	case Opt_bs:
+		n = result.uint_32;
+		if (n != 512 && n != 1024 && n != 2048
+		    && n != 4096) {
+			pr_warn("Invalid blocksize (512, 1024, 2048, 4096 allowed)\n");
+			return -EINVAL;
 		}
+		ctx->blocksize = n;
+		break;
+	case Opt_mode:
+		ctx->mode = result.uint_32 & 0777;
+		affs_set_opt(ctx->mount_flags, SF_SETMODE);
+		break;
+	case Opt_mufs:
+		affs_set_opt(ctx->mount_flags, SF_MUFS);
+		break;
+	case Opt_notruncate:
+		affs_set_opt(ctx->mount_flags, SF_NO_TRUNCATE);
+		break;
+	case Opt_prefix:
+		kfree(ctx->prefix);
+		ctx->prefix = param->string;
+		param->string = NULL;
+		affs_set_opt(ctx->mount_flags, SF_PREFIX);
+		break;
+	case Opt_protect:
+		affs_set_opt(ctx->mount_flags, SF_IMMUTABLE);
+		break;
+	case Opt_reserved:
+		ctx->reserved = result.uint_32;
+		break;
+	case Opt_root:
+		ctx->root_block = result.uint_32;
+		break;
+	case Opt_setgid:
+		ctx->gid = result.gid;
+		affs_set_opt(ctx->mount_flags, SF_SETGID);
+		break;
+	case Opt_setuid:
+		ctx->uid = result.uid;
+		affs_set_opt(ctx->mount_flags, SF_SETUID);
+		break;
+	case Opt_verbose:
+		affs_set_opt(ctx->mount_flags, SF_VERBOSE);
+		break;
+	case Opt_volume:
+		strscpy(ctx->volume, param->string, 32);
+		break;
+	case Opt_ignore:
+		/* Silently ignore the quota options */
+		break;
+	default:
+		return -EINVAL;
 	}
-	return 1;
+	return 0;
 }
 
 static int affs_show_options(struct seq_file *m, struct dentry *root)
@@ -329,27 +303,22 @@ static int affs_show_options(struct seq_file *m, struct dentry *root)
  * hopefully have the guts to do so. Until then: sorry for the mess.
  */
 
-static int affs_fill_super(struct super_block *sb, void *data, int silent)
+static int affs_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	struct affs_sb_info	*sbi;
+	struct affs_context	*ctx = fc->fs_private;
 	struct buffer_head	*root_bh = NULL;
 	struct buffer_head	*boot_bh;
 	struct inode		*root_inode = NULL;
-	s32			 root_block;
+	int			 silent = fc->sb_flags & SB_SILENT;
 	int			 size, blocksize;
 	u32			 chksum;
 	int			 num_bm;
 	int			 i, j;
-	kuid_t			 uid;
-	kgid_t			 gid;
-	int			 reserved;
-	unsigned long		 mount_flags;
 	int			 tmp_flags;	/* fix remount prototype... */
 	u8			 sig[4];
 	int			 ret;
 
-	pr_debug("read_super(%s)\n", data ? (const char *)data : "no options");
-
 	sb->s_magic             = AFFS_SUPER_MAGIC;
 	sb->s_op                = &affs_sops;
 	sb->s_flags |= SB_NODIRATIME;
@@ -369,19 +338,16 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 	spin_lock_init(&sbi->work_lock);
 	INIT_DELAYED_WORK(&sbi->sb_work, flush_superblock);
 
-	if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block,
-				&blocksize,&sbi->s_prefix,
-				sbi->s_volume, &mount_flags)) {
-		pr_err("Error parsing options\n");
-		return -EINVAL;
-	}
-	/* N.B. after this point s_prefix must be released */
+	sbi->s_flags	= ctx->mount_flags;
+	sbi->s_mode	= ctx->mode;
+	sbi->s_uid	= ctx->uid;
+	sbi->s_gid	= ctx->gid;
+	sbi->s_reserved	= ctx->reserved;
+	sbi->s_prefix	= ctx->prefix;
+	ctx->prefix	= NULL;
+	memcpy(sbi->s_volume, ctx->volume, 32);
 
-	sbi->s_flags   = mount_flags;
-	sbi->s_mode    = i;
-	sbi->s_uid     = uid;
-	sbi->s_gid     = gid;
-	sbi->s_reserved= reserved;
+	/* N.B. after this point s_prefix must be released */
 
 	/* Get the size of the device in 512-byte blocks.
 	 * If we later see that the partition uses bigger
@@ -396,15 +362,16 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 
 	i = bdev_logical_block_size(sb->s_bdev);
 	j = PAGE_SIZE;
+	blocksize = ctx->blocksize;
 	if (blocksize > 0) {
 		i = j = blocksize;
 		size = size / (blocksize / 512);
 	}
 
 	for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) {
-		sbi->s_root_block = root_block;
-		if (root_block < 0)
-			sbi->s_root_block = (reserved + size - 1) / 2;
+		sbi->s_root_block = ctx->root_block;
+		if (ctx->root_block < 0)
+			sbi->s_root_block = (ctx->reserved + size - 1) / 2;
 		pr_debug("setting blocksize to %d\n", blocksize);
 		affs_set_blocksize(sb, blocksize);
 		sbi->s_partition_size = size;
@@ -424,7 +391,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 				"size=%d, reserved=%d\n",
 				sb->s_id,
 				sbi->s_root_block + num_bm,
-				blocksize, size, reserved);
+				ctx->blocksize, size, ctx->reserved);
 			root_bh = affs_bread(sb, sbi->s_root_block + num_bm);
 			if (!root_bh)
 				continue;
@@ -447,7 +414,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 got_root:
 	/* Keep super block in cache */
 	sbi->s_root_bh = root_bh;
-	root_block = sbi->s_root_block;
+	ctx->root_block = sbi->s_root_block;
 
 	/* Find out which kind of FS we have */
 	boot_bh = sb_bread(sb, 0);
@@ -506,7 +473,7 @@ got_root:
 		return -EINVAL;
 	}
 
-	if (affs_test_opt(mount_flags, SF_VERBOSE)) {
+	if (affs_test_opt(ctx->mount_flags, SF_VERBOSE)) {
 		u8 len = AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0];
 		pr_notice("Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n",
 			len > 31 ? 31 : len,
@@ -528,14 +495,14 @@ got_root:
 
 	/* set up enough so that it can read an inode */
 
-	root_inode = affs_iget(sb, root_block);
+	root_inode = affs_iget(sb, ctx->root_block);
 	if (IS_ERR(root_inode))
 		return PTR_ERR(root_inode);
 
 	if (affs_test_opt(AFFS_SB(sb)->s_flags, SF_INTL))
-		sb->s_d_op = &affs_intl_dentry_operations;
+		set_default_d_op(sb, &affs_intl_dentry_operations);
 	else
-		sb->s_d_op = &affs_dentry_operations;
+		set_default_d_op(sb, &affs_dentry_operations);
 
 	sb->s_root = d_make_root(root_inode);
 	if (!sb->s_root) {
@@ -548,56 +515,43 @@ got_root:
 	return 0;
 }
 
-static int
-affs_remount(struct super_block *sb, int *flags, char *data)
+static int affs_reconfigure(struct fs_context *fc)
 {
+	struct super_block	*sb = fc->root->d_sb;
+	struct affs_context	*ctx = fc->fs_private;
 	struct affs_sb_info	*sbi = AFFS_SB(sb);
-	int			 blocksize;
-	kuid_t			 uid;
-	kgid_t			 gid;
-	int			 mode;
-	int			 reserved;
-	int			 root_block;
-	unsigned long		 mount_flags;
 	int			 res = 0;
-	char			 volume[32];
-	char			*prefix = NULL;
-
-	pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data);
 
 	sync_filesystem(sb);
-	*flags |= SB_NODIRATIME;
-
-	memcpy(volume, sbi->s_volume, 32);
-	if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block,
-			   &blocksize, &prefix, volume,
-			   &mount_flags)) {
-		kfree(prefix);
-		return -EINVAL;
-	}
+	fc->sb_flags |= SB_NODIRATIME;
 
 	flush_delayed_work(&sbi->sb_work);
 
-	sbi->s_flags = mount_flags;
-	sbi->s_mode  = mode;
-	sbi->s_uid   = uid;
-	sbi->s_gid   = gid;
+	/*
+	 * NB: Historically, only mount_flags, mode, uid, gid, prefix,
+	 * and volume are accepted during remount.
+	 */
+	sbi->s_flags = ctx->mount_flags;
+	sbi->s_mode  = ctx->mode;
+	sbi->s_uid   = ctx->uid;
+	sbi->s_gid   = ctx->gid;
 	/* protect against readers */
 	spin_lock(&sbi->symlink_lock);
-	if (prefix) {
+	if (ctx->prefix) {
 		kfree(sbi->s_prefix);
-		sbi->s_prefix = prefix;
+		sbi->s_prefix = ctx->prefix;
+		ctx->prefix = NULL;
 	}
-	memcpy(sbi->s_volume, volume, 32);
+	memcpy(sbi->s_volume, ctx->volume, 32);
 	spin_unlock(&sbi->symlink_lock);
 
-	if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
+	if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb))
 		return 0;
 
-	if (*flags & SB_RDONLY)
+	if (fc->sb_flags & SB_RDONLY)
 		affs_free_bitmap(sb);
 	else
-		res = affs_init_bitmap(sb, flags);
+		res = affs_init_bitmap(sb, &fc->sb_flags);
 
 	return res;
 }
@@ -624,10 +578,9 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
-static struct dentry *affs_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int affs_get_tree(struct fs_context *fc)
 {
-	return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super);
+	return get_tree_bdev(fc, affs_fill_super);
 }
 
 static void affs_kill_sb(struct super_block *sb)
@@ -643,12 +596,69 @@ static void affs_kill_sb(struct super_block *sb)
 	}
 }
 
+/*
+ * Dispose of the parsing context attached to @fc, along with any prefix
+ * string it still owns.
+ */
+static void affs_free_fc(struct fs_context *fc)
+{
+	struct affs_context *ctx = fc->fs_private;
+
+	kfree(ctx->prefix);
+	kfree(ctx);
+}
+
+/* fs_context callbacks installed by affs_init_fs_context(). */
+static const struct fs_context_operations affs_context_ops = {
+	.parse_param	= affs_parse_param,
+	.get_tree	= affs_get_tree,
+	.reconfigure	= affs_reconfigure,
+	.free		= affs_free_fc,
+};
+
+/*
+ * Allocate the affs parsing context for @fc and seed it: a reconfigure
+ * inherits only the current volume name, any other purpose gets the defaults.
+ * Returns 0 on success or -ENOMEM if the context cannot be allocated.
+ */
+static int affs_init_fs_context(struct fs_context *fc)
+{
+	struct affs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+
+	if (!ctx)
+		return -ENOMEM;
+
+	if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) {
+		ctx->uid	= current_uid();
+		ctx->gid	= current_gid();
+		ctx->reserved	= 2;
+		ctx->root_block	= -1;
+		ctx->blocksize	= -1;
+		ctx->volume[0]	= ':';
+	} else {
+		struct affs_sb_info *sbi = AFFS_SB(fc->root->d_sb);
+
+		/*
+		 * NB: historically, no options other than volume were
+		 * preserved across a remount unless they were explicitly
+		 * passed in.
+		 */
+		memcpy(ctx->volume, sbi->s_volume, 32);
+	}
+
+	fc->fs_private = ctx;
+	fc->ops = &affs_context_ops;
+
+	return 0;
+}
+
 static struct file_system_type affs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "affs",
-	.mount		= affs_mount,
 	.kill_sb	= affs_kill_sb,
 	.fs_flags	= FS_REQUIRES_DEV,
+	.init_fs_context = affs_init_fs_context,
+	.parameters	= affs_param_spec,
 };
 MODULE_ALIAS_FS("affs");
 
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index fc8ba9142f2f..682bd8ec2c10 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -5,6 +5,7 @@ config AFS_FS
 	select AF_RXRPC
 	select DNS_RESOLVER
 	select NETFS_SUPPORT
+	select CRYPTO_KRB5
 	help
 	  If you say Y here, you will get an experimental Andrew File System
 	  driver. It currently only supports unsecured read-only AFS access.
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index dcdc0f1bb76f..b49b8fe682f3 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -8,9 +8,11 @@ kafs-y := \
 	addr_prefs.o \
 	callback.o \
 	cell.o \
+	cm_security.o \
 	cmservice.o \
 	dir.o \
 	dir_edit.o \
+	dir_search.o \
 	dir_silly.o \
 	dynroot.o \
 	file.o \
diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
index 6d42f85c6be5..e941da5b6dd9 100644
--- a/fs/afs/addr_list.c
+++ b/fs/afs/addr_list.c
@@ -362,3 +362,64 @@ int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *alist,
 	alist->nr_addrs++;
 	return 0;
 }
+
+/*
+ * Set the app data on the rxrpc peers an address list points to.
+ *
+ * Peers found only in @new_alist have their app data set to @server, peers
+ * found only in @old_alist have it cleared, and peers present in both lists
+ * are left as they are.  A NULL @old_alist means a new server and a NULL
+ * @new_alist a dead one; the new-server path dereferences @new_alist, so
+ * the two must not both be NULL.
+ */
+void afs_set_peer_appdata(struct afs_server *server,
+			  struct afs_addr_list *old_alist,
+			  struct afs_addr_list *new_alist)
+{
+	unsigned long data = (unsigned long)server;
+	int n = 0, o = 0;
+
+	if (!old_alist) {
+		/* New server.  Just set all. */
+		for (; n < new_alist->nr_addrs; n++)
+			rxrpc_kernel_set_peer_data(new_alist->addrs[n].peer, data);
+		return;
+	}
+	if (!new_alist) {
+		/* Dead server.  Just remove all. */
+		for (; o < old_alist->nr_addrs; o++)
+			rxrpc_kernel_set_peer_data(old_alist->addrs[o].peer, 0);
+		return;
+	}
+
+	/* Walk through the two lists simultaneously, setting new peers and
+	 * clearing old ones.  The two lists are ordered by pointer to peer
+	 * record.
+	 */
+	while (n < new_alist->nr_addrs && o < old_alist->nr_addrs) {
+		struct rxrpc_peer *pn = new_alist->addrs[n].peer;
+		struct rxrpc_peer *po = old_alist->addrs[o].peer;
+
+		if (pn == po) {
+			/* Peer is in both lists: keep its app data, but step
+			 * past it in both lists or the walk never terminates.
+			 */
+			n++;
+			o++;
+			continue;
+		}
+		if (pn < po) {
+			rxrpc_kernel_set_peer_data(pn, data);
+			n++;
+		} else {
+			rxrpc_kernel_set_peer_data(po, 0);
+			o++;
+		}
+	}
+
+	/* At most one list still has entries; flush whichever remains. */
+	for (; n < new_alist->nr_addrs; n++)
+		rxrpc_kernel_set_peer_data(new_alist->addrs[n].peer, data);
+	for (; o < old_alist->nr_addrs; o++)
+		rxrpc_kernel_set_peer_data(old_alist->addrs[o].peer, 0);
+}
diff --git a/fs/afs/addr_prefs.c b/fs/afs/addr_prefs.c
index a189ff8a5034..133736412c3d 100644
--- a/fs/afs/addr_prefs.c
+++ b/fs/afs/addr_prefs.c
@@ -48,7 +48,7 @@ static int afs_split_string(char **pbuf, char *strv[], unsigned int maxstrv)
 		strv[count++] = p;
 
 		/* Skip over word */
-		while (!isspace(*p))
+		while (!isspace(*p) && *p)
 			p++;
 		if (!*p)
 			break;
@@ -413,8 +413,10 @@ int afs_proc_addr_prefs_write(struct file *file, char *buf, size_t size)
 
 	do {
 		argc = afs_split_string(&buf, argv, ARRAY_SIZE(argv));
-		if (argc < 0)
-			return argc;
+		if (argc < 0) {
+			ret = argc;
+			goto done;
+		}
 		if (argc < 2)
 			goto inval;
 
diff --git a/fs/afs/afs.h b/fs/afs/afs.h
index b488072aee87..ec3db00bd081 100644
--- a/fs/afs/afs.h
+++ b/fs/afs/afs.h
@@ -10,7 +10,7 @@
 
 #include <linux/in.h>
 
-#define AFS_MAXCELLNAME		256  	/* Maximum length of a cell name */
+#define AFS_MAXCELLNAME		253  	/* Maximum length of a cell name (DNS limited) */
 #define AFS_MAXVOLNAME		64  	/* Maximum length of a volume name */
 #define AFS_MAXNSERVERS		8   	/* Maximum servers in a basic volume record */
 #define AFS_NMAXNSERVERS	13  	/* Maximum servers in a N/U-class volume record */
diff --git a/fs/afs/afs_vl.h b/fs/afs/afs_vl.h
index 9c65ffb8a523..b835e25a2c02 100644
--- a/fs/afs/afs_vl.h
+++ b/fs/afs/afs_vl.h
@@ -13,6 +13,7 @@
 #define AFS_VL_PORT		7003	/* volume location service port */
 #define VL_SERVICE		52	/* RxRPC service ID for the Volume Location service */
 #define YFS_VL_SERVICE		2503	/* Service ID for AuriStor upgraded VL service */
+#define YFS_VL_MAXCELLNAME	256  	/* Maximum length of a cell name in YFS protocol */
 
 enum AFSVL_Operations {
 	VLGETENTRYBYID		= 503,	/* AFS Get VLDB entry by ID */
@@ -134,13 +135,4 @@ struct afs_uvldbentry__xdr {
 	__be32			spares9;
 };
 
-struct afs_address_list {
-	refcount_t		usage;
-	unsigned int		version;
-	unsigned int		nr_addrs;
-	struct sockaddr_rxrpc	addrs[];
-};
-
-extern void afs_put_address_list(struct afs_address_list *alist);
-
 #endif /* AFS_VL_H */
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 99b2c8172021..894d2bad6b6c 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -41,8 +41,8 @@ static void afs_volume_init_callback(struct afs_volume *volume)
 
 	list_for_each_entry(vnode, &volume->open_mmaps, cb_mmap_link) {
 		if (vnode->cb_v_check != atomic_read(&volume->cb_v_break)) {
-			atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE);
-			queue_work(system_unbound_wq, &vnode->cb_work);
+			afs_clear_cb_promise(vnode, afs_cb_promise_clear_vol_init_cb);
+			queue_work(system_dfl_wq, &vnode->cb_work);
 		}
 	}
 
@@ -79,7 +79,7 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas
 	_enter("");
 
 	clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
-	if (atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE) {
+	if (afs_clear_cb_promise(vnode, afs_cb_promise_clear_cb_break)) {
 		vnode->cb_break++;
 		vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break);
 		afs_clear_permits(vnode);
@@ -90,7 +90,7 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas
 		if (reason != afs_cb_break_for_deleted &&
 		    vnode->status.type == AFS_FTYPE_FILE &&
 		    atomic_read(&vnode->cb_nr_mmap))
-			queue_work(system_unbound_wq, &vnode->cb_work);
+			queue_work(system_dfl_wq, &vnode->cb_work);
 
 		trace_afs_cb_break(&vnode->fid, vnode->cb_break, reason, true);
 	} else {
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index caa09875f520..71c10a05cebe 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -20,8 +20,9 @@ static unsigned __read_mostly afs_cell_min_ttl = 10 * 60;
 static unsigned __read_mostly afs_cell_max_ttl = 24 * 60 * 60;
 static atomic_t cell_debug_id;
 
-static void afs_queue_cell_manager(struct afs_net *);
-static void afs_manage_cell_work(struct work_struct *);
+static void afs_cell_timer(struct timer_list *timer);
+static void afs_destroy_cell_work(struct work_struct *work);
+static void afs_manage_cell_work(struct work_struct *work);
 
 static void afs_dec_cells_outstanding(struct afs_net *net)
 {
@@ -29,19 +30,11 @@ static void afs_dec_cells_outstanding(struct afs_net *net)
 		wake_up_var(&net->cells_outstanding);
 }
 
-/*
- * Set the cell timer to fire after a given delay, assuming it's not already
- * set for an earlier time.
- */
-static void afs_set_cell_timer(struct afs_net *net, time64_t delay)
+static void afs_set_cell_state(struct afs_cell *cell, enum afs_cell_state state)
 {
-	if (net->live) {
-		atomic_inc(&net->cells_outstanding);
-		if (timer_reduce(&net->cells_timer, jiffies + delay * HZ))
-			afs_dec_cells_outstanding(net);
-	} else {
-		afs_queue_cell_manager(net);
-	}
+	smp_store_release(&cell->state, state); /* Commit cell changes before state */
+	smp_wmb(); /* Set cell state before task state */
+	wake_up_var(&cell->state);
 }
 
 /*
@@ -64,7 +57,8 @@ static struct afs_cell *afs_find_cell_locked(struct afs_net *net,
 		return ERR_PTR(-ENAMETOOLONG);
 
 	if (!name) {
-		cell = net->ws_cell;
+		cell = rcu_dereference_protected(net->ws_cell,
+						 lockdep_is_held(&net->cells_lock));
 		if (!cell)
 			return ERR_PTR(-EDESTADDRREQ);
 		goto found;
@@ -115,7 +109,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
 				       const char *name, unsigned int namelen,
 				       const char *addresses)
 {
-	struct afs_vlserver_list *vllist;
+	struct afs_vlserver_list *vllist = NULL;
 	struct afs_cell *cell;
 	int i, ret;
 
@@ -146,27 +140,37 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
 		return ERR_PTR(-ENOMEM);
 	}
 
-	cell->name = kmalloc(namelen + 1, GFP_KERNEL);
+	/* Allocate the cell name and the key name in one go. */
+	cell->name = kmalloc(1 + namelen + 1 +
+			     4 + namelen + 1, GFP_KERNEL);
 	if (!cell->name) {
 		kfree(cell);
 		return ERR_PTR(-ENOMEM);
 	}
 
-	cell->net = net;
+	cell->name[0] = '.';
+	cell->name++;
 	cell->name_len = namelen;
 	for (i = 0; i < namelen; i++)
 		cell->name[i] = tolower(name[i]);
-	cell->name[i] = 0;
+	cell->name[i++] = 0;
 
+	cell->key_desc = cell->name + i;
+	memcpy(cell->key_desc, "afs@", 4);
+	memcpy(cell->key_desc + 4, cell->name, cell->name_len + 1);
+
+	cell->net = net;
 	refcount_set(&cell->ref, 1);
 	atomic_set(&cell->active, 0);
+	INIT_WORK(&cell->destroyer, afs_destroy_cell_work);
 	INIT_WORK(&cell->manager, afs_manage_cell_work);
+	timer_setup(&cell->management_timer, afs_cell_timer, 0);
 	init_rwsem(&cell->vs_lock);
 	cell->volumes = RB_ROOT;
 	INIT_HLIST_HEAD(&cell->proc_volumes);
 	seqlock_init(&cell->volume_lock);
 	cell->fs_servers = RB_ROOT;
-	seqlock_init(&cell->fs_lock);
+	init_rwsem(&cell->fs_lock);
 	rwlock_init(&cell->vl_servers_lock);
 	cell->flags = (1 << AFS_CELL_FL_CHECK_ALIAS);
 
@@ -179,6 +183,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
 					      VL_SERVICE, AFS_VL_PORT);
 		if (IS_ERR(vllist)) {
 			ret = PTR_ERR(vllist);
+			vllist = NULL;
 			goto parse_failed;
 		}
 
@@ -201,7 +206,13 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
 	cell->dns_status = vllist->status;
 	smp_store_release(&cell->dns_lookup_count, 1); /* vs source/status */
 	atomic_inc(&net->cells_outstanding);
+	ret = idr_alloc_cyclic(&net->cells_dyn_ino, cell,
+			       2, INT_MAX / 2, GFP_KERNEL);
+	if (ret < 0)
+		goto error;
+	cell->dynroot_ino = ret;
 	cell->debug_id = atomic_inc_return(&cell_debug_id);
+
 	trace_afs_cell(cell->debug_id, 1, 0, afs_cell_trace_alloc);
 
 	_leave(" = %p", cell);
@@ -211,7 +222,8 @@ parse_failed:
 	if (ret == -EINVAL)
 		printk(KERN_ERR "kAFS: bad VL server IP address\n");
 error:
-	kfree(cell->name);
+	afs_put_vlserverlist(cell->net, vllist);
+	kfree(cell->name - 1);
 	kfree(cell);
 	_leave(" = %d", ret);
 	return ERR_PTR(ret);
@@ -223,7 +235,8 @@ error:
  * @name:	The name of the cell.
  * @namesz:	The strlen of the cell name.
  * @vllist:	A colon/comma separated list of numeric IP addresses or NULL.
- * @excl:	T if an error should be given if the cell name already exists.
+ * @reason:	The reason we're doing the lookup
+ * @trace:	The reason to be logged if the lookup is successful.
  *
  * Look up a cell record by name and query the DNS for VL server addresses if
  * needed.  Note that that actual DNS query is punted off to the manager thread
@@ -232,19 +245,27 @@ error:
  */
 struct afs_cell *afs_lookup_cell(struct afs_net *net,
 				 const char *name, unsigned int namesz,
-				 const char *vllist, bool excl)
+				 const char *vllist,
+				 enum afs_lookup_cell_for reason,
+				 enum afs_cell_trace trace)
 {
 	struct afs_cell *cell, *candidate, *cursor;
 	struct rb_node *parent, **pp;
 	enum afs_cell_state state;
 	int ret, n;
 
-	_enter("%s,%s", name, vllist);
+	_enter("%s,%s,%u", name, vllist, reason);
 
-	if (!excl) {
-		cell = afs_find_cell(net, name, namesz, afs_cell_trace_use_lookup);
-		if (!IS_ERR(cell))
+	if (reason != AFS_LOOKUP_CELL_PRELOAD) {
+		cell = afs_find_cell(net, name, namesz, trace);
+		if (!IS_ERR(cell)) {
+			if (reason == AFS_LOOKUP_CELL_DYNROOT)
+				goto no_wait;
+			if (cell->state == AFS_CELL_SETTING_UP ||
+			    cell->state == AFS_CELL_UNLOOKED)
+				goto lookup_cell;
 			goto wait_for_cell;
+		}
 	}
 
 	/* Assume we're probably going to create a cell and preallocate and
@@ -285,29 +306,74 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
 
 	cell = candidate;
 	candidate = NULL;
-	atomic_set(&cell->active, 2);
-	trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), 2, afs_cell_trace_insert);
+	afs_use_cell(cell, trace);
 	rb_link_node_rcu(&cell->net_node, parent, pp);
 	rb_insert_color(&cell->net_node, &net->cells);
 	up_write(&net->cells_lock);
 
-	afs_queue_cell(cell, afs_cell_trace_get_queue_new);
+lookup_cell:
+	if (reason != AFS_LOOKUP_CELL_PRELOAD &&
+	    reason != AFS_LOOKUP_CELL_ROOTCELL) {
+		set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags);
+		afs_queue_cell(cell, afs_cell_trace_queue_new);
+	}
 
 wait_for_cell:
-	trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), atomic_read(&cell->active),
-		       afs_cell_trace_wait);
-	_debug("wait_for_cell");
-	wait_var_event(&cell->state,
-		       ({
-			       state = smp_load_acquire(&cell->state); /* vs error */
-			       state == AFS_CELL_ACTIVE || state == AFS_CELL_REMOVED;
-		       }));
+	state = smp_load_acquire(&cell->state); /* vs error */
+	switch (state) {
+	case AFS_CELL_ACTIVE:
+	case AFS_CELL_DEAD:
+		break;
+	case AFS_CELL_UNLOOKED:
+	default:
+		if (reason == AFS_LOOKUP_CELL_PRELOAD ||
+		    reason == AFS_LOOKUP_CELL_ROOTCELL)
+			break;
+		_debug("wait_for_cell");
+		afs_see_cell(cell, afs_cell_trace_wait);
+		wait_var_event(&cell->state,
+			       ({
+				       state = smp_load_acquire(&cell->state); /* vs error */
+				       state == AFS_CELL_ACTIVE || state == AFS_CELL_DEAD;
+			       }));
+		_debug("waited_for_cell %d %d", cell->state, cell->error);
+	}
 
+no_wait:
 	/* Check the state obtained from the wait check. */
-	if (state == AFS_CELL_REMOVED) {
+	state = smp_load_acquire(&cell->state); /* vs error */
+	if (state == AFS_CELL_DEAD) {
 		ret = cell->error;
 		goto error;
 	}
+	if (state == AFS_CELL_ACTIVE) {
+		switch (cell->dns_status) {
+		case DNS_LOOKUP_NOT_DONE:
+			if (cell->dns_source == DNS_RECORD_FROM_CONFIG) {
+				ret = 0;
+				break;
+			}
+			fallthrough;
+		default:
+			ret = -EIO;
+			goto error;
+		case DNS_LOOKUP_GOOD:
+		case DNS_LOOKUP_GOOD_WITH_BAD:
+			ret = 0;
+			break;
+		case DNS_LOOKUP_GOT_NOT_FOUND:
+			ret = -ENOENT;
+			goto error;
+		case DNS_LOOKUP_BAD:
+			ret = -EREMOTEIO;
+			goto error;
+		case DNS_LOOKUP_GOT_LOCAL_FAILURE:
+		case DNS_LOOKUP_GOT_TEMP_FAILURE:
+		case DNS_LOOKUP_GOT_NS_FAILURE:
+			ret = -EDESTADDRREQ;
+			goto error;
+		}
+	}
 
 	_leave(" = %p [cell]", cell);
 	return cell;
@@ -315,10 +381,10 @@ wait_for_cell:
 cell_already_exists:
 	_debug("cell exists");
 	cell = cursor;
-	if (excl) {
+	if (reason == AFS_LOOKUP_CELL_PRELOAD) {
 		ret = -EEXIST;
 	} else {
-		afs_use_cell(cursor, afs_cell_trace_use_lookup);
+		afs_use_cell(cursor, trace);
 		ret = 0;
 	}
 	up_write(&net->cells_lock);
@@ -328,7 +394,7 @@ cell_already_exists:
 		goto wait_for_cell;
 	goto error_noput;
 error:
-	afs_unuse_cell(net, cell, afs_cell_trace_unuse_lookup);
+	afs_unuse_cell(cell, afs_cell_trace_unuse_lookup_error);
 error_noput:
 	_leave(" = %d [error]", ret);
 	return ERR_PTR(ret);
@@ -365,8 +431,18 @@ int afs_cell_init(struct afs_net *net, const char *rootcell)
 		len = cp - rootcell;
 	}
 
-	/* allocate a cell record for the root cell */
-	new_root = afs_lookup_cell(net, rootcell, len, vllist, false);
+	if (len == 0 || !rootcell[0] || rootcell[0] == '.' || rootcell[len - 1] == '.')
+		return -EINVAL;
+	if (memchr(rootcell, '/', len))
+		return -EINVAL;
+	cp = strstr(rootcell, "..");
+	if (cp && cp < rootcell + len)
+		return -EINVAL;
+
+	/* allocate a cell record for the root/workstation cell */
+	new_root = afs_lookup_cell(net, rootcell, len, vllist,
+				   AFS_LOOKUP_CELL_ROOTCELL,
+				   afs_cell_trace_use_lookup_ws);
 	if (IS_ERR(new_root)) {
 		_leave(" = %ld", PTR_ERR(new_root));
 		return PTR_ERR(new_root);
@@ -377,12 +453,11 @@ int afs_cell_init(struct afs_net *net, const char *rootcell)
 
 	/* install the new cell */
 	down_write(&net->cells_lock);
-	afs_see_cell(new_root, afs_cell_trace_see_ws);
-	old_root = net->ws_cell;
-	net->ws_cell = new_root;
+	old_root = rcu_replace_pointer(net->ws_cell, new_root,
+				       lockdep_is_held(&net->cells_lock));
 	up_write(&net->cells_lock);
 
-	afs_unuse_cell(net, old_root, afs_cell_trace_unuse_ws);
+	afs_unuse_cell(old_root, afs_cell_trace_unuse_ws);
 	_leave(" = 0");
 	return 0;
 }
@@ -500,39 +575,24 @@ static void afs_cell_destroy(struct rcu_head *rcu)
 	trace_afs_cell(cell->debug_id, r, atomic_read(&cell->active), afs_cell_trace_free);
 
 	afs_put_vlserverlist(net, rcu_access_pointer(cell->vl_servers));
-	afs_unuse_cell(net, cell->alias_of, afs_cell_trace_unuse_alias);
+	afs_unuse_cell(cell->alias_of, afs_cell_trace_unuse_alias);
 	key_put(cell->anonymous_key);
-	kfree(cell->name);
+	idr_remove(&net->cells_dyn_ino, cell->dynroot_ino);
+	kfree(cell->name - 1);
 	kfree(cell);
 
 	afs_dec_cells_outstanding(net);
 	_leave(" [destroyed]");
 }
 
-/*
- * Queue the cell manager.
- */
-static void afs_queue_cell_manager(struct afs_net *net)
-{
-	int outstanding = atomic_inc_return(&net->cells_outstanding);
-
-	_enter("%d", outstanding);
-
-	if (!queue_work(afs_wq, &net->cells_manager))
-		afs_dec_cells_outstanding(net);
-}
-
-/*
- * Cell management timer.  We have an increment on cells_outstanding that we
- * need to pass along to the work item.
- */
-void afs_cells_timer(struct timer_list *timer)
+static void afs_destroy_cell_work(struct work_struct *work)
 {
-	struct afs_net *net = container_of(timer, struct afs_net, cells_timer);
+	struct afs_cell *cell = container_of(work, struct afs_cell, destroyer);
 
-	_enter("");
-	if (!queue_work(afs_wq, &net->cells_manager))
-		afs_dec_cells_outstanding(net);
+	afs_see_cell(cell, afs_cell_trace_destroy);
+	timer_delete_sync(&cell->management_timer);
+	cancel_work_sync(&cell->manager);
+	call_rcu(&cell->rcu, afs_cell_destroy);
 }
 
 /*
@@ -564,7 +624,7 @@ void afs_put_cell(struct afs_cell *cell, enum afs_cell_trace reason)
 		if (zero) {
 			a = atomic_read(&cell->active);
 			WARN(a != 0, "Cell active count %u > 0\n", a);
-			call_rcu(&cell->rcu, afs_cell_destroy);
+			WARN_ON(!queue_work(afs_wq, &cell->destroyer));
 		}
 	}
 }
@@ -576,10 +636,9 @@ struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason)
 {
 	int r, a;
 
-	r = refcount_read(&cell->ref);
-	WARN_ON(r == 0);
+	__refcount_inc(&cell->ref, &r);
 	a = atomic_inc_return(&cell->active);
-	trace_afs_cell(cell->debug_id, r, a, reason);
+	trace_afs_cell(cell->debug_id, r + 1, a, reason);
 	return cell;
 }
 
@@ -587,10 +646,11 @@ struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason)
  * Record a cell becoming less active.  When the active counter reaches 1, it
  * is scheduled for destruction, but may get reactivated.
  */
-void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_trace reason)
+void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason)
 {
 	unsigned int debug_id;
 	time64_t now, expire_delay;
+	bool zero;
 	int r, a;
 
 	if (!cell)
@@ -605,13 +665,15 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_tr
 		expire_delay = afs_cell_gc_delay;
 
 	debug_id = cell->debug_id;
-	r = refcount_read(&cell->ref);
 	a = atomic_dec_return(&cell->active);
-	trace_afs_cell(debug_id, r, a, reason);
-	WARN_ON(a == 0);
-	if (a == 1)
+	if (!a)
 		/* 'cell' may now be garbage collected. */
-		afs_set_cell_timer(net, expire_delay);
+		afs_set_cell_timer(cell, expire_delay);
+
+	zero = __refcount_dec_and_test(&cell->ref, &r);
+	trace_afs_cell(debug_id, r - 1, a, reason);
+	if (zero)
+		WARN_ON(!queue_work(afs_wq, &cell->destroyer));
 }
 
 /*
@@ -631,36 +693,27 @@ void afs_see_cell(struct afs_cell *cell, enum afs_cell_trace reason)
  */
 void afs_queue_cell(struct afs_cell *cell, enum afs_cell_trace reason)
 {
-	afs_get_cell(cell, reason);
-	if (!queue_work(afs_wq, &cell->manager))
-		afs_put_cell(cell, afs_cell_trace_put_queue_fail);
+	queue_work(afs_wq, &cell->manager);
 }
 
 /*
- * Allocate a key to use as a placeholder for anonymous user security.
+ * Cell-specific management timer.
  */
-static int afs_alloc_anon_key(struct afs_cell *cell)
+static void afs_cell_timer(struct timer_list *timer)
 {
-	struct key *key;
-	char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp;
-
-	/* Create a key to represent an anonymous user. */
-	memcpy(keyname, "afs@", 4);
-	dp = keyname + 4;
-	cp = cell->name;
-	do {
-		*dp++ = tolower(*cp);
-	} while (*cp++);
+	struct afs_cell *cell = container_of(timer, struct afs_cell, management_timer);
 
-	key = rxrpc_get_null_key(keyname);
-	if (IS_ERR(key))
-		return PTR_ERR(key);
-
-	cell->anonymous_key = key;
+	afs_see_cell(cell, afs_cell_trace_see_mgmt_timer);
+	if (refcount_read(&cell->ref) > 0 && cell->net->live)
+		queue_work(afs_wq, &cell->manager);
+}
 
-	_debug("anon key %p{%x}",
-	       cell->anonymous_key, key_serial(cell->anonymous_key));
-	return 0;
+/*
+ * Set/reduce the cell timer.
+ */
+void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs)
+{
+	timer_reduce(&cell->management_timer, jiffies + delay_secs * HZ);
 }
 
 /*
@@ -672,12 +725,6 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell)
 	struct afs_cell *pcell;
 	int ret;
 
-	if (!cell->anonymous_key) {
-		ret = afs_alloc_anon_key(cell);
-		if (ret < 0)
-			return ret;
-	}
-
 	ret = afs_proc_cell_setup(cell);
 	if (ret < 0)
 		return ret;
@@ -695,7 +742,6 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell)
 	if (cell->proc_link.next)
 		cell->proc_link.next->pprev = &cell->proc_link.next;
 
-	afs_dynroot_mkdir(net, cell);
 	mutex_unlock(&net->proc_cells_lock);
 	return 0;
 }
@@ -710,242 +756,167 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell)
 	afs_proc_cell_remove(cell);
 
 	mutex_lock(&net->proc_cells_lock);
-	hlist_del_rcu(&cell->proc_link);
-	afs_dynroot_rmdir(net, cell);
+	if (!hlist_unhashed(&cell->proc_link))
+		hlist_del_rcu(&cell->proc_link);
 	mutex_unlock(&net->proc_cells_lock);
 
 	_leave("");
 }
 
+static bool afs_has_cell_expired(struct afs_cell *cell, time64_t *_next_manage)
+{
+	const struct afs_vlserver_list *vllist;
+	time64_t expire_at = cell->last_inactive;
+	time64_t now = ktime_get_real_seconds();
+
+	if (atomic_read(&cell->active))
+		return false;
+	if (!cell->net->live)
+		return true;
+
+	vllist = rcu_dereference_protected(cell->vl_servers, true);
+	if (vllist && vllist->nr_servers > 0)
+		expire_at += afs_cell_gc_delay;
+
+	if (expire_at <= now)
+		return true;
+	if (expire_at < *_next_manage)
+		*_next_manage = expire_at;
+	return false;
+}
+
 /*
  * Manage a cell record, initialising and destroying it, maintaining its DNS
  * records.
  */
-static void afs_manage_cell(struct afs_cell *cell)
+static bool afs_manage_cell(struct afs_cell *cell)
 {
 	struct afs_net *net = cell->net;
-	int ret, active;
+	time64_t next_manage = TIME64_MAX;
+	int ret;
 
 	_enter("%s", cell->name);
 
-again:
 	_debug("state %u", cell->state);
 	switch (cell->state) {
-	case AFS_CELL_INACTIVE:
-	case AFS_CELL_FAILED:
-		down_write(&net->cells_lock);
-		active = 1;
-		if (atomic_try_cmpxchg_relaxed(&cell->active, &active, 0)) {
-			rb_erase(&cell->net_node, &net->cells);
-			trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), 0,
-				       afs_cell_trace_unuse_delete);
-			smp_store_release(&cell->state, AFS_CELL_REMOVED);
-		}
-		up_write(&net->cells_lock);
-		if (cell->state == AFS_CELL_REMOVED) {
-			wake_up_var(&cell->state);
-			goto final_destruction;
-		}
-		if (cell->state == AFS_CELL_FAILED)
-			goto done;
-		smp_store_release(&cell->state, AFS_CELL_UNSET);
-		wake_up_var(&cell->state);
-		goto again;
-
-	case AFS_CELL_UNSET:
-		smp_store_release(&cell->state, AFS_CELL_ACTIVATING);
-		wake_up_var(&cell->state);
-		goto again;
-
-	case AFS_CELL_ACTIVATING:
-		ret = afs_activate_cell(net, cell);
-		if (ret < 0)
-			goto activation_failed;
+	case AFS_CELL_SETTING_UP:
+		goto set_up_cell;
+	case AFS_CELL_UNLOOKED:
+	case AFS_CELL_ACTIVE:
+		goto cell_is_active;
+	case AFS_CELL_REMOVING:
+		WARN_ON_ONCE(1);
+		return false;
+	case AFS_CELL_DEAD:
+		return false;
+	default:
+		_debug("bad state %u", cell->state);
+		WARN_ON_ONCE(1); /* Unhandled state */
+		return false;
+	}
 
-		smp_store_release(&cell->state, AFS_CELL_ACTIVE);
-		wake_up_var(&cell->state);
-		goto again;
+set_up_cell:
+	ret = afs_activate_cell(net, cell);
+	if (ret < 0) {
+		cell->error = ret;
+		goto remove_cell;
+	}
 
-	case AFS_CELL_ACTIVE:
-		if (atomic_read(&cell->active) > 1) {
-			if (test_and_clear_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) {
-				ret = afs_update_cell(cell);
-				if (ret < 0)
-					cell->error = ret;
-			}
-			goto done;
-		}
-		smp_store_release(&cell->state, AFS_CELL_DEACTIVATING);
-		wake_up_var(&cell->state);
-		goto again;
+	afs_set_cell_state(cell, AFS_CELL_UNLOOKED);
 
-	case AFS_CELL_DEACTIVATING:
-		if (atomic_read(&cell->active) > 1)
-			goto reverse_deactivation;
-		afs_deactivate_cell(net, cell);
-		smp_store_release(&cell->state, AFS_CELL_INACTIVE);
-		wake_up_var(&cell->state);
-		goto again;
+cell_is_active:
+	if (afs_has_cell_expired(cell, &next_manage))
+		goto remove_cell;
 
-	case AFS_CELL_REMOVED:
-		goto done;
+	if (test_and_clear_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) {
+		ret = afs_update_cell(cell);
+		if (ret < 0)
+			cell->error = ret;
+		if (cell->state == AFS_CELL_UNLOOKED)
+			afs_set_cell_state(cell, AFS_CELL_ACTIVE);
+	}
 
-	default:
-		break;
+	if (next_manage < TIME64_MAX && cell->net->live) {
+		time64_t now = ktime_get_real_seconds();
+
+		if (next_manage - now <= 0)
+			afs_queue_cell(cell, afs_cell_trace_queue_again);
+		else
+			afs_set_cell_timer(cell, next_manage - now);
 	}
-	_debug("bad state %u", cell->state);
-	BUG(); /* Unhandled state */
+	_leave(" [done %u]", cell->state);
+	return false;
 
-activation_failed:
-	cell->error = ret;
-	afs_deactivate_cell(net, cell);
+remove_cell:
+	down_write(&net->cells_lock);
 
-	smp_store_release(&cell->state, AFS_CELL_FAILED); /* vs error */
-	wake_up_var(&cell->state);
-	goto again;
+	if (atomic_read(&cell->active)) {
+		up_write(&net->cells_lock);
+		goto cell_is_active;
+	}
 
-reverse_deactivation:
-	smp_store_release(&cell->state, AFS_CELL_ACTIVE);
-	wake_up_var(&cell->state);
-	_leave(" [deact->act]");
-	return;
+	/* Make sure that the expiring server records are going to see the fact
+	 * that the cell is caput.
+	 */
+	afs_set_cell_state(cell, AFS_CELL_REMOVING);
 
-done:
-	_leave(" [done %u]", cell->state);
-	return;
+	afs_deactivate_cell(net, cell);
+	afs_purge_servers(cell);
+
+	rb_erase(&cell->net_node, &net->cells);
+	afs_see_cell(cell, afs_cell_trace_unuse_delete);
+	up_write(&net->cells_lock);
 
-final_destruction:
 	/* The root volume is pinning the cell */
 	afs_put_volume(cell->root_volume, afs_volume_trace_put_cell_root);
 	cell->root_volume = NULL;
-	afs_put_cell(cell, afs_cell_trace_put_destroy);
+
+	afs_set_cell_state(cell, AFS_CELL_DEAD);
+	return true;
 }
 
 static void afs_manage_cell_work(struct work_struct *work)
 {
 	struct afs_cell *cell = container_of(work, struct afs_cell, manager);
+	bool final_put;
 
-	afs_manage_cell(cell);
-	afs_put_cell(cell, afs_cell_trace_put_queue_work);
+	afs_see_cell(cell, afs_cell_trace_manage);
+	final_put = afs_manage_cell(cell);
+	afs_see_cell(cell, afs_cell_trace_managed);
+	if (final_put)
+		afs_put_cell(cell, afs_cell_trace_put_final);
 }
 
 /*
- * Manage the records of cells known to a network namespace.  This includes
- * updating the DNS records and garbage collecting unused cells that were
- * automatically added.
- *
- * Note that constructed cell records may only be removed from net->cells by
- * this work item, so it is safe for this work item to stash a cursor pointing
- * into the tree and then return to caller (provided it skips cells that are
- * still under construction).
- *
- * Note also that we were given an increment on net->cells_outstanding by
- * whoever queued us that we need to deal with before returning.
+ * Purge in-memory cell database.
  */
-void afs_manage_cells(struct work_struct *work)
+void afs_cell_purge(struct afs_net *net)
 {
-	struct afs_net *net = container_of(work, struct afs_net, cells_manager);
+	struct afs_cell *ws;
 	struct rb_node *cursor;
-	time64_t now = ktime_get_real_seconds(), next_manage = TIME64_MAX;
-	bool purging = !net->live;
 
 	_enter("");
 
-	/* Trawl the cell database looking for cells that have expired from
-	 * lack of use and cells whose DNS results have expired and dispatch
-	 * their managers.
-	 */
-	down_read(&net->cells_lock);
+	down_write(&net->cells_lock);
+	ws = rcu_replace_pointer(net->ws_cell, NULL,
+				 lockdep_is_held(&net->cells_lock));
+	up_write(&net->cells_lock);
+	afs_unuse_cell(ws, afs_cell_trace_unuse_ws);
 
+	_debug("kick cells");
+	down_read(&net->cells_lock);
 	for (cursor = rb_first(&net->cells); cursor; cursor = rb_next(cursor)) {
-		struct afs_cell *cell =
-			rb_entry(cursor, struct afs_cell, net_node);
-		unsigned active;
-		bool sched_cell = false;
-
-		active = atomic_read(&cell->active);
-		trace_afs_cell(cell->debug_id, refcount_read(&cell->ref),
-			       active, afs_cell_trace_manage);
-
-		ASSERTCMP(active, >=, 1);
-
-		if (purging) {
-			if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) {
-				active = atomic_dec_return(&cell->active);
-				trace_afs_cell(cell->debug_id, refcount_read(&cell->ref),
-					       active, afs_cell_trace_unuse_pin);
-			}
-		}
+		struct afs_cell *cell = rb_entry(cursor, struct afs_cell, net_node);
 
-		if (active == 1) {
-			struct afs_vlserver_list *vllist;
-			time64_t expire_at = cell->last_inactive;
-
-			read_lock(&cell->vl_servers_lock);
-			vllist = rcu_dereference_protected(
-				cell->vl_servers,
-				lockdep_is_held(&cell->vl_servers_lock));
-			if (vllist->nr_servers > 0)
-				expire_at += afs_cell_gc_delay;
-			read_unlock(&cell->vl_servers_lock);
-			if (purging || expire_at <= now)
-				sched_cell = true;
-			else if (expire_at < next_manage)
-				next_manage = expire_at;
-		}
+		afs_see_cell(cell, afs_cell_trace_purge);
 
-		if (!purging) {
-			if (test_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags))
-				sched_cell = true;
-		}
+		if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags))
+			afs_unuse_cell(cell, afs_cell_trace_unuse_pin);
 
-		if (sched_cell)
-			afs_queue_cell(cell, afs_cell_trace_get_queue_manage);
+		afs_queue_cell(cell, afs_cell_trace_queue_purge);
 	}
-
 	up_read(&net->cells_lock);
 
-	/* Update the timer on the way out.  We have to pass an increment on
-	 * cells_outstanding in the namespace that we are in to the timer or
-	 * the work scheduler.
-	 */
-	if (!purging && next_manage < TIME64_MAX) {
-		now = ktime_get_real_seconds();
-
-		if (next_manage - now <= 0) {
-			if (queue_work(afs_wq, &net->cells_manager))
-				atomic_inc(&net->cells_outstanding);
-		} else {
-			afs_set_cell_timer(net, next_manage - now);
-		}
-	}
-
-	afs_dec_cells_outstanding(net);
-	_leave(" [%d]", atomic_read(&net->cells_outstanding));
-}
-
-/*
- * Purge in-memory cell database.
- */
-void afs_cell_purge(struct afs_net *net)
-{
-	struct afs_cell *ws;
-
-	_enter("");
-
-	down_write(&net->cells_lock);
-	ws = net->ws_cell;
-	net->ws_cell = NULL;
-	up_write(&net->cells_lock);
-	afs_unuse_cell(net, ws, afs_cell_trace_unuse_ws);
-
-	_debug("del timer");
-	if (del_timer_sync(&net->cells_timer))
-		atomic_dec(&net->cells_outstanding);
-
-	_debug("kick mgr");
-	afs_queue_cell_manager(net);
-
 	_debug("wait");
 	wait_var_event(&net->cells_outstanding,
 		       !atomic_read(&net->cells_outstanding));
diff --git a/fs/afs/cm_security.c b/fs/afs/cm_security.c
new file mode 100644
index 000000000000..edcbd249d202
--- /dev/null
+++ b/fs/afs/cm_security.c
@@ -0,0 +1,340 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Cache manager security.
+ *
+ * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/slab.h>
+#include <crypto/krb5.h>
+#include "internal.h"
+#include "afs_cm.h"
+#include "afs_fs.h"
+#include "protocol_yfs.h"
+#define RXRPC_TRACE_ONLY_DEFINE_ENUMS
+#include <trace/events/rxrpc.h>
+
+#define RXGK_SERVER_ENC_TOKEN 1036U // 0x40c
+#define xdr_round_up(x) (round_up((x), sizeof(__be32)))
+#define xdr_len_object(x) (4 + round_up((x), sizeof(__be32)))
+
+#ifdef CONFIG_RXGK
+static int afs_create_yfs_cm_token(struct sk_buff *challenge,
+				   struct afs_server *server);
+#endif
+
+/* Respond to a security challenge passed up by rxrpc, adding the per-server appdata
+ * for YFS-RxGK; unknown services and unsupported security classes are rejected.
+ */
+static int afs_respond_to_challenge(struct sk_buff *challenge)
+{
+#ifdef CONFIG_RXGK
+	struct krb5_buffer appdata = {}; /* stays empty unless a CM token exists */
+	struct afs_server *server;
+#endif
+	struct rxrpc_peer *peer;
+	unsigned long peer_data;
+	u16 service_id;
+	u8 security_index;
+
+	rxrpc_kernel_query_challenge(challenge, &peer, &peer_data,
+				     &service_id, &security_index);
+
+	_enter("%u,%u", service_id, security_index);
+
+	switch (service_id) {
+		/* We don't send CM_SERVICE RPCs, so don't expect a challenge
+		 * therefrom.
+		 */
+	case FS_SERVICE:
+	case VL_SERVICE:
+	case YFS_FS_SERVICE:
+	case YFS_VL_SERVICE:
+		break;
+	default:
+		pr_warn("Can't respond to unknown challenge %u:%u\n",
+			service_id, security_index);
+		return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO,
+						     afs_abort_unsupported_sec_class);
+	}
+
+	switch (security_index) {
+#ifdef CONFIG_RXKAD
+	case RXRPC_SECURITY_RXKAD:
+		return rxkad_kernel_respond_to_challenge(challenge);
+#endif
+
+#ifdef CONFIG_RXGK
+	case RXRPC_SECURITY_RXGK:
+		return rxgk_kernel_respond_to_challenge(challenge, &appdata);
+
+	case RXRPC_SECURITY_YFS_RXGK:
+		switch (service_id) {
+		case FS_SERVICE:
+		case YFS_FS_SERVICE:
+			server = (struct afs_server *)peer_data; /* peer app data is the server */
+			if (!server->cm_rxgk_appdata.data) { /* build it once, under the lock */
+				mutex_lock(&server->cm_token_lock);
+				if (!server->cm_rxgk_appdata.data)
+					afs_create_yfs_cm_token(challenge, server);
+				mutex_unlock(&server->cm_token_lock);
+			}
+			if (server->cm_rxgk_appdata.data) /* else respond with empty appdata */
+				appdata = server->cm_rxgk_appdata;
+			break;
+		}
+		return rxgk_kernel_respond_to_challenge(challenge, &appdata);
+#endif
+
+	default:
+		return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO,
+						     afs_abort_unsupported_sec_class);
+	}
+}
+
+/* Work item: drain the socket's OOB message queue, responding to each challenge
+ * packet.  The result of afs_respond_to_challenge() is not checked here.
+ */
+void afs_process_oob_queue(struct work_struct *work)
+{
+	struct afs_net *net = container_of(work, struct afs_net, rx_oob_work);
+	struct sk_buff *oob;
+	enum rxrpc_oob_type type;
+
+	while ((oob = rxrpc_kernel_dequeue_oob(net->socket, &type))) {
+		switch (type) {
+		case RXRPC_OOB_CHALLENGE:
+			afs_respond_to_challenge(oob);
+			break;
+		}
+		rxrpc_kernel_free_oob(oob); /* every dequeued message is freed, whatever its type */
+	}
+}
+
+#ifdef CONFIG_RXGK
+/* Create a securities keyring for the cache manager and attach a key to it for
+ * the RxGK tokens we want to use to secure the callback connection back from
+ * the fileserver.  Returns 0 with net->fs_cm_token_key set, or a negative
+ * error code.  The local keyring reference is always dropped on the way out.
+ */
+int afs_create_token_key(struct afs_net *net, struct socket *socket)
+{
+	const struct krb5_enctype *krb5;
+	struct key *ring;
+	key_ref_t key;
+	char K0[32], *desc;
+	int ret;
+
+	ring = keyring_alloc("kafs",
+			     GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(),
+			     KEY_POS_SEARCH | KEY_POS_WRITE |
+			     KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH,
+			     KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
+	if (IS_ERR(ring))
+		return PTR_ERR(ring);
+
+	ret = rxrpc_sock_set_security_keyring(socket->sk, ring);
+	if (ret < 0)
+		goto out;
+
+	ret = -ENOPKG;
+	krb5 = crypto_krb5_find_enctype(KRB5_ENCTYPE_AES128_CTS_HMAC_SHA1_96);
+	if (!krb5)
+		goto out;
+
+	if (WARN_ON_ONCE(krb5->key_len > sizeof(K0))) /* key must fit the stack buffer */
+		goto out;
+
+	ret = -ENOMEM;
+	desc = kasprintf(GFP_KERNEL, "%u:%u:%u:%u",
+			 YFS_CM_SERVICE, RXRPC_SECURITY_YFS_RXGK, 1, krb5->etype);
+	if (!desc)
+		goto out;
+
+	wait_for_random_bytes();
+	get_random_bytes(K0, krb5->key_len);
+
+	key = key_create(make_key_ref(ring, true),
+			 "rxrpc_s", desc,
+			 K0, krb5->key_len,
+			 KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | KEY_USR_VIEW,
+			 KEY_ALLOC_NOT_IN_QUOTA);
+	kfree(desc);
+	if (IS_ERR(key)) {
+		ret = PTR_ERR(key);
+		goto out;
+	}
+
+	net->fs_cm_token_key = key_ref_to_ptr(key);
+	ret = 0;
+out:
+	memzero_explicit(K0, sizeof(K0)); /* don't leave key material on the stack */
+	key_put(ring);
+	return ret;
+}
+
+/* Create a YFS RxGK GSS token to use as a ticket to the specified fileserver and
+ * store the resulting YFSAppData blob in server->cm_rxgk_appdata; <0 on failure.
+ */
+static int afs_create_yfs_cm_token(struct sk_buff *challenge,
+				   struct afs_server *server)
+{
+	const struct krb5_enctype *conn_krb5, *token_krb5;
+	const struct krb5_buffer *token_key;
+	struct crypto_aead *aead;
+	struct scatterlist sg;
+	struct afs_net *net = server->cell->net;
+	const struct key *key = net->fs_cm_token_key;
+	size_t keysize, uuidsize, authsize, toksize, encsize, contsize, adatasize, offset;
+	__be32 caps[1] = {
+		[0] = htonl(AFS_CAP_ERROR_TRANSLATION),
+	};
+	__be32 *xdr;
+	void *appdata, *K0, *encbase;
+	u32 enctype;
+	int ret;
+
+	if (!key)
+		return -ENOKEY;
+
+	/* Assume that the fileserver is happy to use the same encoding type as
+	 * we were told to use by the token obtained by the user.
+	 */
+	enctype = rxgk_kernel_query_challenge(challenge);
+
+	conn_krb5 = crypto_krb5_find_enctype(enctype);
+	if (!conn_krb5)
+		return -ENOPKG;
+	token_krb5 = key->payload.data[0]; /* NOTE(review): rxrpc_s payload layout, confirm */
+	token_key = (const struct krb5_buffer *)&key->payload.data[2];
+
+	/* struct rxgk_key {
+	 *	afs_uint32	enctype;
+	 *	opaque		key<>;
+	 * };
+	 */
+	keysize = 4 + xdr_len_object(conn_krb5->key_len);
+
+	/* struct RXGK_AuthName {
+	 *	afs_int32	kind;
+	 *	opaque		data<AUTHDATAMAX>;
+	 *	opaque		display<AUTHPRINTABLEMAX>;
+	 * };
+	 */
+	uuidsize = sizeof(server->uuid);
+	authsize = 4 + xdr_len_object(uuidsize) + xdr_len_object(0);
+
+	/* struct RXGK_Token {
+	 *	rxgk_key		K0;
+	 *	RXGK_Level		level;
+	 *	rxgkTime		starttime;
+	 *	afs_int32		lifetime;
+	 *	afs_int32		bytelife;
+	 *	rxgkTime		expirationtime;
+	 *	struct RXGK_AuthName	identities<>;
+	 * };
+	 */
+	toksize = keysize + 8 + 4 + 4 + 8 + xdr_len_object(authsize);
+
+	offset = 0;
+	encsize = crypto_krb5_how_much_buffer(token_krb5, KRB5_ENCRYPT_MODE, toksize, &offset);
+
+	/* struct RXGK_TokenContainer {
+	 *	afs_int32	kvno;
+	 *	afs_int32	enctype;
+	 *	opaque		encrypted_token<>;
+	 * };
+	 */
+	contsize = 4 + 4 + xdr_len_object(encsize);
+
+	/* struct YFSAppData {
+	 *	opr_uuid	initiatorUuid;
+	 *	opr_uuid	acceptorUuid;
+	 *	Capabilities	caps;
+	 *	afs_int32	enctype;
+	 *	opaque		callbackKey<>;
+	 *	opaque		callbackToken<>;
+	 * };
+	 */
+	adatasize = 16 + 16 +
+		xdr_len_object(sizeof(caps)) +
+		4 +
+		xdr_len_object(conn_krb5->key_len) +
+		xdr_len_object(contsize);
+
+	ret = -ENOMEM;
+	appdata = kzalloc(adatasize, GFP_KERNEL); /* zeroed, so XDR padding is already 0 */
+	if (!appdata)
+		goto out;
+	xdr = appdata;
+
+	memcpy(xdr, &net->uuid, 16);		/* appdata.initiatorUuid */
+	xdr += 16 / 4;
+	memcpy(xdr, &server->uuid, 16);		/* appdata.acceptorUuid */
+	xdr += 16 / 4;
+	*xdr++ = htonl(ARRAY_SIZE(caps));	/* appdata.caps.len */
+	memcpy(xdr, &caps, sizeof(caps));	/* appdata.caps */
+	xdr += ARRAY_SIZE(caps);
+	*xdr++ = htonl(conn_krb5->etype);	/* appdata.enctype */
+
+	*xdr++ = htonl(conn_krb5->key_len);	/* appdata.callbackKey.len */
+	K0 = xdr;
+	get_random_bytes(K0, conn_krb5->key_len); /* appdata.callbackKey.data */
+	xdr += xdr_round_up(conn_krb5->key_len) / 4;
+
+	*xdr++ = htonl(contsize);		/* appdata.callbackToken.len */
+	*xdr++ = htonl(1);			/* cont.kvno */
+	*xdr++ = htonl(token_krb5->etype);	/* cont.enctype */
+	*xdr++ = htonl(encsize);		/* cont.encrypted_token.len */
+
+	encbase = xdr;
+	xdr += offset / 4; /* plaintext starts 'offset' bytes into the encrypted region */
+	*xdr++ = htonl(conn_krb5->etype);	/* token.K0.enctype */
+	*xdr++ = htonl(conn_krb5->key_len);	/* token.K0.key.len */
+	memcpy(xdr, K0, conn_krb5->key_len);	/* token.K0.key.data */
+	xdr += xdr_round_up(conn_krb5->key_len) / 4;
+
+	*xdr++ = htonl(RXRPC_SECURITY_ENCRYPT);	/* token.level */
+	*xdr++ = htonl(0);			/* token.starttime */
+	*xdr++ = htonl(0);			/* " */
+	*xdr++ = htonl(0);			/* token.lifetime */
+	*xdr++ = htonl(0);			/* token.bytelife */
+	*xdr++ = htonl(0);			/* token.expirationtime */
+	*xdr++ = htonl(0);			/* " */
+	*xdr++ = htonl(1);			/* token.identities.count */
+	*xdr++ = htonl(0);			/* token.identities[0].kind */
+	*xdr++ = htonl(uuidsize);		/* token.identities[0].data.len */
+	memcpy(xdr, &server->uuid, uuidsize);
+	xdr += xdr_round_up(uuidsize) / 4;
+	*xdr++ = htonl(0);			/* token.identities[0].display.len */
+
+	xdr = encbase + xdr_round_up(encsize); /* step over the whole (padded) encrypted token */
+
+	if ((unsigned long)xdr - (unsigned long)appdata != adatasize)
+		pr_err("Appdata size incorrect %lx != %zx\n",
+		       (unsigned long)xdr - (unsigned long)appdata, adatasize); /* only logged */
+
+	aead = crypto_krb5_prepare_encryption(token_krb5, token_key, RXGK_SERVER_ENC_TOKEN,
+					      GFP_KERNEL);
+	if (IS_ERR(aead)) {
+		ret = PTR_ERR(aead);
+		goto out_token;
+	}
+
+	sg_init_one(&sg, encbase, encsize);
+	ret = crypto_krb5_encrypt(token_krb5, aead, &sg, 1, encsize, offset, toksize, false);
+	if (ret < 0)
+		goto out_aead;
+
+	server->cm_rxgk_appdata.len  = adatasize;
+	server->cm_rxgk_appdata.data = appdata;
+	appdata = NULL; /* now owned by server; the kfree() below becomes a no-op */
+
+out_aead:
+	crypto_free_aead(aead);
+out_token:
+	kfree(appdata);
+out:
+	return ret;
+}
+#endif /* CONFIG_RXGK */
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 99a3f20bc786..1a906805a9e3 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -139,49 +139,6 @@ bool afs_cm_incoming_call(struct afs_call *call)
 }
 
 /*
- * Find the server record by peer address and record a probe to the cache
- * manager from a server.
- */
-static int afs_find_cm_server_by_peer(struct afs_call *call)
-{
-	struct sockaddr_rxrpc srx;
-	struct afs_server *server;
-	struct rxrpc_peer *peer;
-
-	peer = rxrpc_kernel_get_call_peer(call->net->socket, call->rxcall);
-
-	server = afs_find_server(call->net, peer);
-	if (!server) {
-		trace_afs_cm_no_server(call, &srx);
-		return 0;
-	}
-
-	call->server = server;
-	return 0;
-}
-
-/*
- * Find the server record by server UUID and record a probe to the cache
- * manager from a server.
- */
-static int afs_find_cm_server_by_uuid(struct afs_call *call,
-				      struct afs_uuid *uuid)
-{
-	struct afs_server *server;
-
-	rcu_read_lock();
-	server = afs_find_server_by_uuid(call->net, call->request);
-	rcu_read_unlock();
-	if (!server) {
-		trace_afs_cm_no_server_u(call, call->request);
-		return 0;
-	}
-
-	call->server = server;
-	return 0;
-}
-
-/*
  * Clean up a cache manager call.
  */
 static void afs_cm_destructor(struct afs_call *call)
@@ -322,10 +279,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
 
 	if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
 		return afs_io_error(call, afs_io_error_cm_reply);
-
-	/* we'll need the file server record as that tells us which set of
-	 * vnodes to operate upon */
-	return afs_find_cm_server_by_peer(call);
+	return 0;
 }
 
 /*
@@ -349,18 +303,10 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work)
  */
 static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
 {
-	int ret;
-
 	_enter("");
 
 	afs_extract_discard(call, 0);
-	ret = afs_extract_data(call, false);
-	if (ret < 0)
-		return ret;
-
-	/* we'll need the file server record as that tells us which set of
-	 * vnodes to operate upon */
-	return afs_find_cm_server_by_peer(call);
+	return afs_extract_data(call, false);
 }
 
 /*
@@ -373,8 +319,6 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
 	__be32 *b;
 	int ret;
 
-	_enter("");
-
 	_enter("{%u}", call->unmarshall);
 
 	switch (call->unmarshall) {
@@ -421,9 +365,13 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
 	if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
 		return afs_io_error(call, afs_io_error_cm_reply);
 
-	/* we'll need the file server record as that tells us which set of
-	 * vnodes to operate upon */
-	return afs_find_cm_server_by_uuid(call, call->request);
+	if (memcmp(call->request, &call->server->_uuid, sizeof(call->server->_uuid)) != 0) {
+		pr_notice("Callback UUID does not match fileserver UUID\n");
+		trace_afs_cm_no_server_u(call, call->request);
+		return 0;
+	}
+
+	return 0;
 }
 
 /*
@@ -455,7 +403,7 @@ static int afs_deliver_cb_probe(struct afs_call *call)
 
 	if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
 		return afs_io_error(call, afs_io_error_cm_reply);
-	return afs_find_cm_server_by_peer(call);
+	return 0;
 }
 
 /*
@@ -533,7 +481,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
 
 	if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
 		return afs_io_error(call, afs_io_error_cm_reply);
-	return afs_find_cm_server_by_peer(call);
+	return 0;
 }
 
 /*
@@ -593,7 +541,7 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call)
 
 	if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
 		return afs_io_error(call, afs_io_error_cm_reply);
-	return afs_find_cm_server_by_peer(call);
+	return 0;
 }
 
 /*
@@ -667,9 +615,5 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call)
 
 	if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
 		return afs_io_error(call, afs_io_error_cm_reply);
-
-	/* We'll need the file server record as that tells us which set of
-	 * vnodes to operate upon.
-	 */
-	return afs_find_cm_server_by_peer(call);
+	return 0;
 }
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 67afe68972d5..f4e9e12373ac 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -12,6 +12,8 @@
 #include <linux/swap.h>
 #include <linux/ctype.h>
 #include <linux/sched.h>
+#include <linux/iversion.h>
+#include <linux/iov_iter.h>
 #include <linux/task_io_accounting_ops.h>
 #include "internal.h"
 #include "afs_fs.h"
@@ -21,7 +23,8 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 				 unsigned int flags);
 static int afs_dir_open(struct inode *inode, struct file *file);
 static int afs_readdir(struct file *file, struct dir_context *ctx);
-static int afs_d_revalidate(struct dentry *dentry, unsigned int flags);
+static int afs_d_revalidate(struct inode *dir, const struct qstr *name,
+			    struct dentry *dentry, unsigned int flags);
 static int afs_d_delete(const struct dentry *dentry);
 static void afs_d_iput(struct dentry *dentry, struct inode *inode);
 static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen,
@@ -30,8 +33,8 @@ static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, int nl
 			      loff_t fpos, u64 ino, unsigned dtype);
 static int afs_create(struct mnt_idmap *idmap, struct inode *dir,
 		      struct dentry *dentry, umode_t mode, bool excl);
-static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
-		     struct dentry *dentry, umode_t mode);
+static struct dentry *afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+				struct dentry *dentry, umode_t mode);
 static int afs_rmdir(struct inode *dir, struct dentry *dentry);
 static int afs_unlink(struct inode *dir, struct dentry *dentry);
 static int afs_link(struct dentry *from, struct inode *dir,
@@ -41,15 +44,6 @@ static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 		      struct dentry *old_dentry, struct inode *new_dir,
 		      struct dentry *new_dentry, unsigned int flags);
-static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags);
-static void afs_dir_invalidate_folio(struct folio *folio, size_t offset,
-				   size_t length);
-
-static bool afs_dir_dirty_folio(struct address_space *mapping,
-		struct folio *folio)
-{
-	BUG(); /* This should never happen. */
-}
 
 const struct file_operations afs_dir_file_operations = {
 	.open		= afs_dir_open,
@@ -74,10 +68,7 @@ const struct inode_operations afs_dir_inode_operations = {
 };
 
 const struct address_space_operations afs_dir_aops = {
-	.dirty_folio	= afs_dir_dirty_folio,
-	.release_folio	= afs_dir_release_folio,
-	.invalidate_folio = afs_dir_invalidate_folio,
-	.migrate_folio	= filemap_migrate_folio,
+	.writepages	= afs_single_writepages,
 };
 
 const struct dentry_operations afs_fs_dentry_operations = {
@@ -98,152 +89,124 @@ struct afs_lookup_one_cookie {
 struct afs_lookup_cookie {
 	struct dir_context	ctx;
 	struct qstr		name;
-	bool			found;
-	bool			one_only;
 	unsigned short		nr_fids;
 	struct afs_fid		fids[50];
 };
 
+/* Unuse the dir's cache cookie; aux data and size are only passed if ret == 0. */
+static void afs_dir_unuse_cookie(struct afs_vnode *dvnode, int ret)
+{
+	struct afs_vnode_cache_aux aux;
+	loff_t i_size = i_size_read(&dvnode->netfs.inode);
+	bool ok = (ret == 0);
+
+	if (ok)
+		afs_set_cache_aux(dvnode, &aux);
+	fscache_unuse_cookie(afs_vnode_cache(dvnode), ok ? &aux : NULL,
+			     ok ? &i_size : NULL);
+}
+
 /*
- * Drop the refs that we're holding on the folios we were reading into.  We've
- * got refs on the first nr_pages pages.
+ * Iterate through a kmapped directory segment, dumping a summary of each whole
+ * block.  Returns the number of trailing bytes not dumped (0 if block-aligned).
  */
-static void afs_dir_read_cleanup(struct afs_read *req)
+static size_t afs_dir_dump_step(void *iter_base, size_t progress, size_t len,
+				void *priv, void *priv2)
 {
-	struct address_space *mapping = req->vnode->netfs.inode.i_mapping;
-	struct folio *folio;
-	pgoff_t last = req->nr_pages - 1;
+	while (len >= AFS_DIR_BLOCK_SIZE) { /* len is unsigned: never step past the end */
+		union afs_xdr_dir_block *block = iter_base;
 
-	XA_STATE(xas, &mapping->i_pages, 0);
+		pr_warn("[%05zx] %32phN\n", progress, block);
+		iter_base += AFS_DIR_BLOCK_SIZE;
+		progress += AFS_DIR_BLOCK_SIZE;
+		len -= AFS_DIR_BLOCK_SIZE;
+	}
 
-	if (unlikely(!req->nr_pages))
-		return;
+	return len;
+}
 
-	rcu_read_lock();
-	xas_for_each(&xas, folio, last) {
-		if (xas_retry(&xas, folio))
-			continue;
-		BUG_ON(xa_is_value(folio));
-		ASSERTCMP(folio->mapping, ==, mapping);
+/*
+ * Dump the contents of a directory.
+ */
+static void afs_dir_dump(struct afs_vnode *dvnode)
+{
+	struct iov_iter iter;
+	unsigned long long i_size = i_size_read(&dvnode->netfs.inode);
 
-		folio_put(folio);
-	}
+	pr_warn("DIR %llx:%llx is=%llx\n",
+		dvnode->fid.vid, dvnode->fid.vnode, i_size);
 
-	rcu_read_unlock();
+	iov_iter_folio_queue(&iter, ITER_SOURCE, dvnode->directory, 0, 0, i_size);
+	iterate_folioq(&iter, iov_iter_count(&iter), NULL, NULL,
+		       afs_dir_dump_step);
 }
 
 /*
  * check that a directory folio is valid
  */
-static bool afs_dir_check_folio(struct afs_vnode *dvnode, struct folio *folio,
-				loff_t i_size)
+static bool afs_dir_check_block(struct afs_vnode *dvnode, size_t progress,
+				union afs_xdr_dir_block *block)
 {
-	union afs_xdr_dir_block *block;
-	size_t offset, size;
-	loff_t pos;
+	if (block->hdr.magic != AFS_DIR_MAGIC) {
+		pr_warn("%s(%lx): [%zx] bad magic %04x\n",
+		       __func__, dvnode->netfs.inode.i_ino,
+		       progress, ntohs(block->hdr.magic));
+		trace_afs_dir_check_failed(dvnode, progress);
+		trace_afs_file_error(dvnode, -EIO, afs_file_error_dir_bad_magic);
+		return false;
+	}
 
-	/* Determine how many magic numbers there should be in this folio, but
-	 * we must take care because the directory may change size under us.
+	/* Make sure each block is NUL terminated so we can reasonably
+	 * use string functions on it.  The filenames in the folio
+	 * *should* be NUL-terminated anyway.
 	 */
-	pos = folio_pos(folio);
-	if (i_size <= pos)
-		goto checked;
-
-	size = min_t(loff_t, folio_size(folio), i_size - pos);
-	for (offset = 0; offset < size; offset += sizeof(*block)) {
-		block = kmap_local_folio(folio, offset);
-		if (block->hdr.magic != AFS_DIR_MAGIC) {
-			printk("kAFS: %s(%lx): [%llx] bad magic %zx/%zx is %04hx\n",
-			       __func__, dvnode->netfs.inode.i_ino,
-			       pos, offset, size, ntohs(block->hdr.magic));
-			trace_afs_dir_check_failed(dvnode, pos + offset, i_size);
-			kunmap_local(block);
-			trace_afs_file_error(dvnode, -EIO, afs_file_error_dir_bad_magic);
-			goto error;
-		}
-
-		/* Make sure each block is NUL terminated so we can reasonably
-		 * use string functions on it.  The filenames in the folio
-		 * *should* be NUL-terminated anyway.
-		 */
-		((u8 *)block)[AFS_DIR_BLOCK_SIZE - 1] = 0;
-
-		kunmap_local(block);
-	}
-checked:
+	((u8 *)block)[AFS_DIR_BLOCK_SIZE - 1] = 0;
 	afs_stat_v(dvnode, n_read_dir);
 	return true;
-
-error:
-	return false;
 }
 
 /*
- * Dump the contents of a directory.
+ * Iterate through a kmapped directory segment, checking each block in turn.
  */
-static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req)
+static size_t afs_dir_check_step(void *iter_base, size_t progress, size_t len,
+				 void *priv, void *priv2)
 {
-	union afs_xdr_dir_block *block;
-	struct address_space *mapping = dvnode->netfs.inode.i_mapping;
-	struct folio *folio;
-	pgoff_t last = req->nr_pages - 1;
-	size_t offset, size;
-
-	XA_STATE(xas, &mapping->i_pages, 0);
-
-	pr_warn("DIR %llx:%llx f=%llx l=%llx al=%llx\n",
-		dvnode->fid.vid, dvnode->fid.vnode,
-		req->file_size, req->len, req->actual_len);
-	pr_warn("DIR %llx %x %zx %zx\n",
-		req->pos, req->nr_pages,
-		req->iter->iov_offset,  iov_iter_count(req->iter));
-
-	xas_for_each(&xas, folio, last) {
-		if (xas_retry(&xas, folio))
-			continue;
+	struct afs_vnode *dvnode = priv;
 
-		BUG_ON(folio->mapping != mapping);
+	if (WARN_ON_ONCE(progress % AFS_DIR_BLOCK_SIZE || len % AFS_DIR_BLOCK_SIZE))
+		return len; /* misaligned segment: report nothing as checked */
 
-		size = min_t(loff_t, folio_size(folio), req->actual_len - folio_pos(folio));
-		for (offset = 0; offset < size; offset += sizeof(*block)) {
-			block = kmap_local_folio(folio, offset);
-			pr_warn("[%02lx] %32phN\n", folio->index + offset, block);
-			kunmap_local(block);
-		}
-	}
+	do {
+		if (!afs_dir_check_block(dvnode, progress, iter_base))
+			break;
+		iter_base += AFS_DIR_BLOCK_SIZE;
+		progress += AFS_DIR_BLOCK_SIZE; /* so a failure is reported at its own block */
+		len -= AFS_DIR_BLOCK_SIZE;
+	} while (len > 0);
+
+	return len; /* bytes left unchecked; 0 if every block passed */
 }
 
 /*
- * Check all the blocks in a directory.  All the folios are held pinned.
+ * Check all the blocks in a directory.
  */
-static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req)
+static int afs_dir_check(struct afs_vnode *dvnode)
 {
-	struct address_space *mapping = dvnode->netfs.inode.i_mapping;
-	struct folio *folio;
-	pgoff_t last = req->nr_pages - 1;
-	int ret = 0;
-
-	XA_STATE(xas, &mapping->i_pages, 0);
+	struct iov_iter iter;
+	unsigned long long i_size = i_size_read(&dvnode->netfs.inode);
+	size_t checked = 0;
 
-	if (unlikely(!req->nr_pages))
+	if (unlikely(!i_size))
 		return 0;
 
-	rcu_read_lock();
-	xas_for_each(&xas, folio, last) {
-		if (xas_retry(&xas, folio))
-			continue;
-
-		BUG_ON(folio->mapping != mapping);
-
-		if (!afs_dir_check_folio(dvnode, folio, req->actual_len)) {
-			afs_dir_dump(dvnode, req);
-			ret = -EIO;
-			break;
-		}
+	iov_iter_folio_queue(&iter, ITER_SOURCE, dvnode->directory, 0, 0, i_size);
+	checked = iterate_folioq(&iter, iov_iter_count(&iter), dvnode, NULL,
+				 afs_dir_check_step);
+	if (checked != i_size) {
+		afs_dir_dump(dvnode);
+		return -EIO;
 	}
-
-	rcu_read_unlock();
-	return ret;
+	return 0;
 }
 
 /*
@@ -263,134 +226,140 @@ static int afs_dir_open(struct inode *inode, struct file *file)
 }
 
 /*
- * Read the directory into the pagecache in one go, scrubbing the previous
- * contents.  The list of folios is returned, pinning them so that they don't
- * get reclaimed during the iteration.
+ * Read a file in a single download.
  */
-static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)
-	__acquires(&dvnode->validate_lock)
+static ssize_t afs_do_read_single(struct afs_vnode *dvnode, struct file *file)
 {
-	struct address_space *mapping = dvnode->netfs.inode.i_mapping;
-	struct afs_read *req;
+	struct iov_iter iter;
+	ssize_t ret;
 	loff_t i_size;
-	int nr_pages, i;
-	int ret;
-	loff_t remote_size = 0;
+	bool is_dir = (S_ISDIR(dvnode->netfs.inode.i_mode) &&
+		       !test_bit(AFS_VNODE_MOUNTPOINT, &dvnode->flags));
 
-	_enter("");
-
-	req = kzalloc(sizeof(*req), GFP_KERNEL);
-	if (!req)
-		return ERR_PTR(-ENOMEM);
-
-	refcount_set(&req->usage, 1);
-	req->vnode = dvnode;
-	req->key = key_get(key);
-	req->cleanup = afs_dir_read_cleanup;
-
-expand:
 	i_size = i_size_read(&dvnode->netfs.inode);
-	if (i_size < remote_size)
-	    i_size = remote_size;
-	if (i_size < 2048) {
-		ret = afs_bad(dvnode, afs_file_error_dir_small);
-		goto error;
-	}
-	if (i_size > 2048 * 1024) {
-		trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big);
-		ret = -EFBIG;
-		goto error;
+	if (is_dir) {
+		if (i_size < AFS_DIR_BLOCK_SIZE)
+			return afs_bad(dvnode, afs_file_error_dir_small);
+		if (i_size > AFS_DIR_BLOCK_SIZE * 1024) {
+			trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big);
+			return -EFBIG;
+		}
+	} else {
+		if (i_size > AFSPATHMAX) {
+			trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big);
+			return -EFBIG;
+		}
 	}
 
-	_enter("%llu", i_size);
+	/* Expand the storage.  TODO: Shrink the storage too. */
+	if (dvnode->directory_size < i_size) {
+		size_t cur_size = dvnode->directory_size;
 
-	nr_pages = (i_size + PAGE_SIZE - 1) / PAGE_SIZE;
+		ret = netfs_alloc_folioq_buffer(NULL,
+						&dvnode->directory, &cur_size, i_size,
+						mapping_gfp_mask(dvnode->netfs.inode.i_mapping));
+		dvnode->directory_size = cur_size;
+		if (ret < 0)
+			return ret;
+	}
 
-	req->actual_len = i_size; /* May change */
-	req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */
-	req->data_version = dvnode->status.data_version; /* May change */
-	iov_iter_xarray(&req->def_iter, ITER_DEST, &dvnode->netfs.inode.i_mapping->i_pages,
-			0, i_size);
-	req->iter = &req->def_iter;
+	iov_iter_folio_queue(&iter, ITER_DEST, dvnode->directory, 0, 0, dvnode->directory_size);
 
-	/* Fill in any gaps that we might find where the memory reclaimer has
-	 * been at work and pin all the folios.  If there are any gaps, we will
-	 * need to reread the entire directory contents.
+	/* AFS requires us to perform the read of a directory synchronously as
+	 * a single unit to avoid issues with the directory contents being
+	 * changed between reads.
 	 */
-	i = req->nr_pages;
-	while (i < nr_pages) {
-		struct folio *folio;
-
-		folio = filemap_get_folio(mapping, i);
-		if (IS_ERR(folio)) {
-			if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
-				afs_stat_v(dvnode, n_inval);
-			folio = __filemap_get_folio(mapping,
-						    i, FGP_LOCK | FGP_CREAT,
-						    mapping->gfp_mask);
-			if (IS_ERR(folio)) {
-				ret = PTR_ERR(folio);
-				goto error;
-			}
-			folio_attach_private(folio, (void *)1);
-			folio_unlock(folio);
+	ret = netfs_read_single(&dvnode->netfs.inode, file, &iter);
+	if (ret >= 0) {
+		i_size = i_size_read(&dvnode->netfs.inode);
+		if (i_size > ret) {
+			/* The content has grown, so we need to expand the
+			 * buffer.
+			 */
+			ret = -ESTALE;
+		} else if (is_dir) {
+			int ret2 = afs_dir_check(dvnode);
+
+			if (ret2 < 0)
+				ret = ret2;
+		} else if (i_size < folioq_folio_size(dvnode->directory, 0)) {
+			/* NUL-terminate a symlink. */
+			char *symlink = kmap_local_folio(folioq_folio(dvnode->directory, 0), 0);
+
+			symlink[i_size] = 0;
+			kunmap_local(symlink);
 		}
-
-		req->nr_pages += folio_nr_pages(folio);
-		i += folio_nr_pages(folio);
 	}
 
-	/* If we're going to reload, we need to lock all the pages to prevent
-	 * races.
-	 */
+	return ret;
+}
+
+ssize_t afs_read_single(struct afs_vnode *dvnode, struct file *file)
+{
+	ssize_t ret;
+
+	fscache_use_cookie(afs_vnode_cache(dvnode), false); /* cookie in use across the read */
+	ret = afs_do_read_single(dvnode, file);
+	fscache_unuse_cookie(afs_vnode_cache(dvnode), NULL, NULL); /* no aux data or size passed */
+	return ret; /* afs_do_read_single()'s result, unchanged */
+}
+
+/*
+ * Read the directory into a folio_queue buffer in one go, scrubbing the
+ * previous contents.  We return -ESTALE if the caller needs to call us again.
+ */
+ssize_t afs_read_dir(struct afs_vnode *dvnode, struct file *file)
+	__acquires(&dvnode->validate_lock)
+{
+	ssize_t ret;
+	loff_t i_size;
+
+	i_size = i_size_read(&dvnode->netfs.inode);
+
 	ret = -ERESTARTSYS;
 	if (down_read_killable(&dvnode->validate_lock) < 0)
 		goto error;
 
-	if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
-		goto success;
+	/* We only need to reread the data if it became invalid - or if we
+	 * haven't read it yet.
+	 */
+	if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+	    test_bit(AFS_VNODE_DIR_READ, &dvnode->flags)) {
+		ret = i_size;
+		goto valid;
+	}
 
 	up_read(&dvnode->validate_lock);
 	if (down_write_killable(&dvnode->validate_lock) < 0)
 		goto error;
 
-	if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) {
-		trace_afs_reload_dir(dvnode);
-		ret = afs_fetch_data(dvnode, req);
-		if (ret < 0)
-			goto error_unlock;
-
-		task_io_account_read(PAGE_SIZE * req->nr_pages);
-
-		if (req->len < req->file_size) {
-			/* The content has grown, so we need to expand the
-			 * buffer.
-			 */
-			up_write(&dvnode->validate_lock);
-			remote_size = req->file_size;
-			goto expand;
-		}
+	if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+		afs_invalidate_cache(dvnode, 0);
 
-		/* Validate the data we just read. */
-		ret = afs_dir_check(dvnode, req);
+	if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) ||
+	    !test_bit(AFS_VNODE_DIR_READ, &dvnode->flags)) {
+		trace_afs_reload_dir(dvnode);
+		ret = afs_read_single(dvnode, file);
 		if (ret < 0)
 			goto error_unlock;
 
 		// TODO: Trim excess pages
 
 		set_bit(AFS_VNODE_DIR_VALID, &dvnode->flags);
+		set_bit(AFS_VNODE_DIR_READ, &dvnode->flags);
+	} else {
+		ret = i_size;
 	}
 
 	downgrade_write(&dvnode->validate_lock);
-success:
-	return req;
+valid:
+	return ret;
 
 error_unlock:
 	up_write(&dvnode->validate_lock);
 error:
-	afs_put_read(req);
-	_leave(" = %d", ret);
-	return ERR_PTR(ret);
+	_leave(" = %zd", ret);
+	return ret;
 }
 
 /*
@@ -398,79 +367,69 @@ error:
  */
 static int afs_dir_iterate_block(struct afs_vnode *dvnode,
 				 struct dir_context *ctx,
-				 union afs_xdr_dir_block *block,
-				 unsigned blkoff)
+				 union afs_xdr_dir_block *block)
 {
 	union afs_xdr_dirent *dire;
-	unsigned offset, next, curr, nr_slots;
+	unsigned int blknum, base, hdr, pos, next, nr_slots;
 	size_t nlen;
 	int tmp;
 
-	_enter("%llx,%x", ctx->pos, blkoff);
+	blknum	= ctx->pos / AFS_DIR_BLOCK_SIZE;
+	base	= blknum * AFS_DIR_SLOTS_PER_BLOCK;
+	hdr	= (blknum == 0 ? AFS_DIR_RESV_BLOCKS0 : AFS_DIR_RESV_BLOCKS);
+	pos	= DIV_ROUND_UP(ctx->pos, AFS_DIR_DIRENT_SIZE) - base;
 
-	curr = (ctx->pos - blkoff) / sizeof(union afs_xdr_dirent);
+	_enter("%llx,%x", ctx->pos, blknum);
 
 	/* walk through the block, an entry at a time */
-	for (offset = (blkoff == 0 ? AFS_DIR_RESV_BLOCKS0 : AFS_DIR_RESV_BLOCKS);
-	     offset < AFS_DIR_SLOTS_PER_BLOCK;
-	     offset = next
-	     ) {
+	for (unsigned int slot = hdr; slot < AFS_DIR_SLOTS_PER_BLOCK; slot = next) {
 		/* skip entries marked unused in the bitmap */
-		if (!(block->hdr.bitmap[offset / 8] &
-		      (1 << (offset % 8)))) {
-			_debug("ENT[%zu.%u]: unused",
-			       blkoff / sizeof(union afs_xdr_dir_block), offset);
-			next = offset + 1;
-			if (offset >= curr)
-				ctx->pos = blkoff +
-					next * sizeof(union afs_xdr_dirent);
+		if (!(block->hdr.bitmap[slot / 8] &
+		      (1 << (slot % 8)))) {
+			_debug("ENT[%x]: Unused", base + slot);
+			next = slot + 1;
+			if (next >= pos)
+				ctx->pos = (base + next) * sizeof(union afs_xdr_dirent);
 			continue;
 		}
 
 		/* got a valid entry */
-		dire = &block->dirents[offset];
+		dire = &block->dirents[slot];
 		nlen = strnlen(dire->u.name,
-			       sizeof(*block) -
-			       offset * sizeof(union afs_xdr_dirent));
+			       (unsigned long)(block + 1) - (unsigned long)dire->u.name - 1);
 		if (nlen > AFSNAMEMAX - 1) {
-			_debug("ENT[%zu]: name too long (len %u/%zu)",
-			       blkoff / sizeof(union afs_xdr_dir_block),
-			       offset, nlen);
+			_debug("ENT[%x]: Name too long (len %zx)",
+			       base + slot, nlen);
 			return afs_bad(dvnode, afs_file_error_dir_name_too_long);
 		}
 
-		_debug("ENT[%zu.%u]: %s %zu \"%s\"",
-		       blkoff / sizeof(union afs_xdr_dir_block), offset,
-		       (offset < curr ? "skip" : "fill"),
+		_debug("ENT[%x]: %s %zx \"%s\"",
+		       base + slot, (slot < pos ? "skip" : "fill"),
 		       nlen, dire->u.name);
 
 		nr_slots = afs_dir_calc_slots(nlen);
-		next = offset + nr_slots;
+		next = slot + nr_slots;
 		if (next > AFS_DIR_SLOTS_PER_BLOCK) {
-			_debug("ENT[%zu.%u]:"
-			       " %u extends beyond end dir block"
-			       " (len %zu)",
-			       blkoff / sizeof(union afs_xdr_dir_block),
-			       offset, next, nlen);
+			_debug("ENT[%x]: extends beyond end dir block (len %zx)",
+			       base + slot, nlen);
 			return afs_bad(dvnode, afs_file_error_dir_over_end);
 		}
 
 		/* Check that the name-extension dirents are all allocated */
 		for (tmp = 1; tmp < nr_slots; tmp++) {
-			unsigned int ix = offset + tmp;
-			if (!(block->hdr.bitmap[ix / 8] & (1 << (ix % 8)))) {
-				_debug("ENT[%zu.u]:"
-				       " %u unmarked extension (%u/%u)",
-				       blkoff / sizeof(union afs_xdr_dir_block),
-				       offset, tmp, nr_slots);
+			unsigned int xslot = slot + tmp;
+
+			if (!(block->hdr.bitmap[xslot / 8] & (1 << (xslot % 8)))) {
+				_debug("ENT[%x]: Unmarked extension (%x/%x)",
+				       base + slot, tmp, nr_slots);
 				return afs_bad(dvnode, afs_file_error_dir_unmarked_ext);
 			}
 		}
 
 		/* skip if starts before the current position */
-		if (offset < curr) {
-			if (next > curr)
-				ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent);
+		if (slot < pos) {
+			if (next > pos)
+				ctx->pos = (base + next) * sizeof(union afs_xdr_dirent);
 			continue;
 		}
 
@@ -484,75 +443,110 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode,
 			return 0;
 		}
 
-		ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent);
+		ctx->pos = (base + next) * sizeof(union afs_xdr_dirent);
 	}
 
 	_leave(" = 1 [more]");
 	return 1;
 }
 
+struct afs_dir_iteration_ctx {
+	struct dir_context	*dir_ctx;
+	int			error;
+};
+
 /*
- * iterate through the data blob that lists the contents of an AFS directory
+ * Iterate through a kmapped directory segment.
  */
-static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
-			   struct key *key, afs_dataversion_t *_dir_version)
+static size_t afs_dir_iterate_step(void *iter_base, size_t progress, size_t len,
+				   void *priv, void *priv2)
 {
-	struct afs_vnode *dvnode = AFS_FS_I(dir);
-	union afs_xdr_dir_block *dblock;
-	struct afs_read *req;
-	struct folio *folio;
-	unsigned offset, size;
+	struct afs_dir_iteration_ctx *ctx = priv2;
+	struct afs_vnode *dvnode = priv;
 	int ret;
 
-	_enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos);
-
-	if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) {
-		_leave(" = -ESTALE");
-		return -ESTALE;
+	if (WARN_ON_ONCE(progress % AFS_DIR_BLOCK_SIZE ||
+			 len % AFS_DIR_BLOCK_SIZE)) {
+		pr_err("Mis-iteration prog=%zx len=%zx\n",
+		       progress % AFS_DIR_BLOCK_SIZE,
+		       len % AFS_DIR_BLOCK_SIZE);
+		return len;
 	}
 
-	req = afs_read_dir(dvnode, key);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
-	*_dir_version = req->data_version;
+	do {
+		ret = afs_dir_iterate_block(dvnode, ctx->dir_ctx, iter_base);
+		if (ret != 1)
+			break;
 
-	/* round the file position up to the next entry boundary */
-	ctx->pos += sizeof(union afs_xdr_dirent) - 1;
-	ctx->pos &= ~(sizeof(union afs_xdr_dirent) - 1);
+		ctx->dir_ctx->pos = round_up(ctx->dir_ctx->pos, AFS_DIR_BLOCK_SIZE);
+		iter_base += AFS_DIR_BLOCK_SIZE;
+		len -= AFS_DIR_BLOCK_SIZE;
+	} while (len > 0);
 
-	/* walk through the blocks in sequence */
-	ret = 0;
-	while (ctx->pos < req->actual_len) {
-		/* Fetch the appropriate folio from the directory and re-add it
-		 * to the LRU.  We have all the pages pinned with an extra ref.
-		 */
-		folio = __filemap_get_folio(dir->i_mapping, ctx->pos / PAGE_SIZE,
-					    FGP_ACCESSED, 0);
-		if (IS_ERR(folio)) {
-			ret = afs_bad(dvnode, afs_file_error_dir_missing_page);
-			break;
-		}
+	return len;
+}
 
-		offset = round_down(ctx->pos, sizeof(*dblock)) - folio_file_pos(folio);
-		size = min_t(loff_t, folio_size(folio),
-			     req->actual_len - folio_file_pos(folio));
+/*
+ * Iterate through the directory folios.
+ */
+static int afs_dir_iterate_contents(struct inode *dir, struct dir_context *dir_ctx)
+{
+	struct afs_dir_iteration_ctx ctx = { .dir_ctx = dir_ctx };
+	struct afs_vnode *dvnode = AFS_FS_I(dir);
+	struct iov_iter iter;
+	unsigned long long i_size = i_size_read(dir);
 
-		do {
-			dblock = kmap_local_folio(folio, offset);
-			ret = afs_dir_iterate_block(dvnode, ctx, dblock,
-						    folio_file_pos(folio) + offset);
-			kunmap_local(dblock);
-			if (ret != 1)
-				goto out;
+	/* Round the file position up to the next entry boundary */
+	dir_ctx->pos = round_up(dir_ctx->pos, sizeof(union afs_xdr_dirent));
 
-		} while (offset += sizeof(*dblock), offset < size);
+	if (i_size <= 0 || dir_ctx->pos >= i_size)
+		return 0;
 
-		ret = 0;
-	}
+	iov_iter_folio_queue(&iter, ITER_SOURCE, dvnode->directory, 0, 0, i_size);
+	iov_iter_advance(&iter, round_down(dir_ctx->pos, AFS_DIR_BLOCK_SIZE));
+
+	iterate_folioq(&iter, iov_iter_count(&iter), dvnode, &ctx,
+		       afs_dir_iterate_step);
+
+	if (ctx.error == -ESTALE)
+		afs_invalidate_dir(dvnode, afs_dir_invalid_iter_stale);
+	return ctx.error;
+}
+
+/*
+ * iterate through the data blob that lists the contents of an AFS directory
+ */
+static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
+			   struct file *file, afs_dataversion_t *_dir_version)
+{
+	struct afs_vnode *dvnode = AFS_FS_I(dir);
+	int retry_limit = 100;
+	int ret;
+
+	_enter("{%lu},%llx,,", dir->i_ino, ctx->pos);
+
+	do {
+		if (--retry_limit < 0) {
+			pr_warn("afs_read_dir(): Too many retries\n");
+			ret = -ESTALE;
+			break;
+		}
+		ret = afs_read_dir(dvnode, file);
+		if (ret < 0) {
+			if (ret != -ESTALE)
+				break;
+			if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) {
+				ret = -ESTALE;
+				break;
+			}
+			continue;
+		}
+		*_dir_version = inode_peek_iversion_raw(dir);
+
+		ret = afs_dir_iterate_contents(dir, ctx);
+		up_read(&dvnode->validate_lock);
+	} while (ret == -ESTALE);
 
-out:
-	up_read(&dvnode->validate_lock);
-	afs_put_read(req);
 	_leave(" = %d", ret);
 	return ret;
 }
@@ -564,8 +558,7 @@ static int afs_readdir(struct file *file, struct dir_context *ctx)
 {
 	afs_dataversion_t dir_version;
 
-	return afs_dir_iterate(file_inode(file), ctx, afs_file_key(file),
-			       &dir_version);
+	return afs_dir_iterate(file_inode(file), ctx, file, &dir_version);
 }
 
 /*
@@ -605,22 +598,22 @@ static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name,
  * Do a lookup of a single name in a directory
  * - just returns the FID the dentry name maps to if found
  */
-static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry,
-			     struct afs_fid *fid, struct key *key,
+static int afs_do_lookup_one(struct inode *dir, const struct qstr *name,
+			     struct afs_fid *fid,
 			     afs_dataversion_t *_dir_version)
 {
 	struct afs_super_info *as = dir->i_sb->s_fs_info;
 	struct afs_lookup_one_cookie cookie = {
 		.ctx.actor = afs_lookup_one_filldir,
-		.name = dentry->d_name,
+		.name = *name,
 		.fid.vid = as->volume->vid
 	};
 	int ret;
 
-	_enter("{%lu},%p{%pd},", dir->i_ino, dentry, dentry);
+	_enter("{%lu},{%.*s},", dir->i_ino, name->len, name->name);
 
 	/* search the directory */
-	ret = afs_dir_iterate(dir, &cookie.ctx, key, _dir_version);
+	ret = afs_dir_iterate(dir, &cookie.ctx, NULL, _dir_version);
 	if (ret < 0) {
 		_leave(" = %d [iter]", ret);
 		return ret;
@@ -655,19 +648,10 @@ static bool afs_lookup_filldir(struct dir_context *ctx, const char *name,
 	BUILD_BUG_ON(sizeof(union afs_xdr_dir_block) != 2048);
 	BUILD_BUG_ON(sizeof(union afs_xdr_dirent) != 32);
 
-	if (cookie->found) {
-		if (cookie->nr_fids < 50) {
-			cookie->fids[cookie->nr_fids].vnode	= ino;
-			cookie->fids[cookie->nr_fids].unique	= dtype;
-			cookie->nr_fids++;
-		}
-	} else if (cookie->name.len == nlen &&
-		   memcmp(cookie->name.name, name, nlen) == 0) {
-		cookie->fids[1].vnode	= ino;
-		cookie->fids[1].unique	= dtype;
-		cookie->found = 1;
-		if (cookie->one_only)
-			return false;
+	if (cookie->nr_fids < 50) {
+		cookie->fids[cookie->nr_fids].vnode	= ino;
+		cookie->fids[cookie->nr_fids].unique	= dtype;
+		cookie->nr_fids++;
 	}
 
 	return cookie->nr_fids < 50;
@@ -787,8 +771,7 @@ static bool afs_server_supports_ibulk(struct afs_vnode *dvnode)
  * files in one go and create inodes for them.  The inode of the file we were
  * asked for is returned.
  */
-static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
-				   struct key *key)
+static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry)
 {
 	struct afs_lookup_cookie *cookie;
 	struct afs_vnode_param *vp;
@@ -796,6 +779,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
 	struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode;
 	struct inode *inode = NULL, *ti;
 	afs_dataversion_t data_version = READ_ONCE(dvnode->status.data_version);
+	bool supports_ibulk, isnew;
 	long ret;
 	int i;
 
@@ -812,19 +796,19 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
 	cookie->nr_fids = 2; /* slot 1 is saved for the fid we actually want
 			      * and slot 0 for the directory */
 
-	if (!afs_server_supports_ibulk(dvnode))
-		cookie->one_only = true;
-
-	/* search the directory */
-	ret = afs_dir_iterate(dir, &cookie->ctx, key, &data_version);
+	/* Search the directory for the named entry using the hash table... */
+	ret = afs_dir_search(dvnode, &dentry->d_name, &cookie->fids[1], &data_version);
 	if (ret < 0)
 		goto out;
 
-	dentry->d_fsdata = (void *)(unsigned long)data_version;
+	supports_ibulk = afs_server_supports_ibulk(dvnode);
+	if (supports_ibulk) {
+		/* ...then scan linearly from that point for entries to lookup-ahead. */
+		cookie->ctx.pos = (ret + 1) * AFS_DIR_DIRENT_SIZE;
+		afs_dir_iterate(dir, &cookie->ctx, NULL, &data_version);
+	}
 
-	ret = -ENOENT;
-	if (!cookie->found)
-		goto out;
+	dentry->d_fsdata = (void *)(unsigned long)data_version;
 
 	/* Check to see if we already have an inode for the primary fid. */
 	inode = ilookup5(dir->i_sb, cookie->fids[1].vnode,
@@ -866,7 +850,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
 			 * callback counters.
 			 */
 			ti = ilookup5_nowait(dir->i_sb, vp->fid.vnode,
-					     afs_ilookup5_test_by_fid, &vp->fid);
+					     afs_ilookup5_test_by_fid, &vp->fid, &isnew);
 			if (!IS_ERR_OR_NULL(ti)) {
 				vnode = AFS_FS_I(ti);
 				vp->dv_before = vnode->status.data_version;
@@ -883,7 +867,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
 	 * the whole operation.
 	 */
 	afs_op_set_error(op, -ENOTSUPP);
-	if (!cookie->one_only) {
+	if (supports_ibulk) {
 		op->ops = &afs_inline_bulk_status_operation;
 		afs_begin_vnode_operation(op);
 		afs_wait_for_operation(op);
@@ -925,8 +909,7 @@ out:
 /*
  * Look up an entry in a directory with @sys substitution.
  */
-static struct dentry *afs_lookup_atsys(struct inode *dir, struct dentry *dentry,
-				       struct key *key)
+static struct dentry *afs_lookup_atsys(struct inode *dir, struct dentry *dentry)
 {
 	struct afs_sysnames *subs;
 	struct afs_net *net = afs_i2net(dir);
@@ -960,7 +943,7 @@ static struct dentry *afs_lookup_atsys(struct inode *dir, struct dentry *dentry,
 		}
 
 		strcpy(p, name);
-		ret = lookup_one_len(buf, dentry->d_parent, len);
+		ret = lookup_noperm(&QSTR(buf), dentry->d_parent);
 		if (IS_ERR(ret) || d_is_positive(ret))
 			goto out_s;
 		dput(ret);
@@ -974,7 +957,6 @@ out_s:
 	afs_put_sysnames(subs);
 	kfree(buf);
 out_p:
-	key_put(key);
 	return ret;
 }
 
@@ -988,7 +970,6 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 	struct afs_fid fid = {};
 	struct inode *inode;
 	struct dentry *d;
-	struct key *key;
 	int ret;
 
 	_enter("{%llx:%llu},%p{%pd},",
@@ -1006,15 +987,9 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 		return ERR_PTR(-ESTALE);
 	}
 
-	key = afs_request_key(dvnode->volume->cell);
-	if (IS_ERR(key)) {
-		_leave(" = %ld [key]", PTR_ERR(key));
-		return ERR_CAST(key);
-	}
-
-	ret = afs_validate(dvnode, key);
+	ret = afs_validate(dvnode, NULL);
 	if (ret < 0) {
-		key_put(key);
+		afs_dir_unuse_cookie(dvnode, ret);
 		_leave(" = %d [val]", ret);
 		return ERR_PTR(ret);
 	}
@@ -1024,15 +999,13 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 	    dentry->d_name.name[dentry->d_name.len - 3] == 's' &&
 	    dentry->d_name.name[dentry->d_name.len - 2] == 'y' &&
 	    dentry->d_name.name[dentry->d_name.len - 1] == 's')
-		return afs_lookup_atsys(dir, dentry, key);
+		return afs_lookup_atsys(dir, dentry);
 
 	afs_stat_v(dvnode, n_lookup);
-	inode = afs_do_lookup(dir, dentry, key);
-	key_put(key);
+	inode = afs_do_lookup(dir, dentry);
 	if (inode == ERR_PTR(-ENOENT))
-		inode = afs_try_auto_mntpt(dentry, dir);
-
-	if (!IS_ERR_OR_NULL(inode))
+		inode = NULL;
+	else if (!IS_ERR_OR_NULL(inode))
 		fid = AFS_FS_I(inode)->fid;
 
 	_debug("splice %p", dentry->d_inode);
@@ -1050,21 +1023,12 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 /*
  * Check the validity of a dentry under RCU conditions.
  */
-static int afs_d_revalidate_rcu(struct dentry *dentry)
+static int afs_d_revalidate_rcu(struct afs_vnode *dvnode, struct dentry *dentry)
 {
-	struct afs_vnode *dvnode;
-	struct dentry *parent;
-	struct inode *dir;
 	long dir_version, de_version;
 
 	_enter("%p", dentry);
 
-	/* Check the parent directory is still valid first. */
-	parent = READ_ONCE(dentry->d_parent);
-	dir = d_inode_rcu(parent);
-	if (!dir)
-		return -ECHILD;
-	dvnode = AFS_FS_I(dir);
 	if (test_bit(AFS_VNODE_DELETED, &dvnode->flags))
 		return -ECHILD;
 
@@ -1092,11 +1056,11 @@ static int afs_d_revalidate_rcu(struct dentry *dentry)
  * - NOTE! the hit can be a negative hit too, so we can't assume we have an
  *   inode
  */
-static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
+static int afs_d_revalidate(struct inode *parent_dir, const struct qstr *name,
+			    struct dentry *dentry, unsigned int flags)
 {
-	struct afs_vnode *vnode, *dir;
+	struct afs_vnode *vnode, *dir = AFS_FS_I(parent_dir);
 	struct afs_fid fid;
-	struct dentry *parent;
 	struct inode *inode;
 	struct key *key;
 	afs_dataversion_t dir_version, invalid_before;
@@ -1104,7 +1068,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 	int ret;
 
 	if (flags & LOOKUP_RCU)
-		return afs_d_revalidate_rcu(dentry);
+		return afs_d_revalidate_rcu(dir, dentry);
 
 	if (d_really_is_positive(dentry)) {
 		vnode = AFS_FS_I(d_inode(dentry));
@@ -1119,14 +1083,9 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 	if (IS_ERR(key))
 		key = NULL;
 
-	/* Hold the parent dentry so we can peer at it */
-	parent = dget_parent(dentry);
-	dir = AFS_FS_I(d_inode(parent));
-
 	/* validate the parent directory */
 	ret = afs_validate(dir, key);
 	if (ret == -ERESTARTSYS) {
-		dput(parent);
 		key_put(key);
 		return ret;
 	}
@@ -1154,7 +1113,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 	afs_stat_v(dir, n_reval);
 
 	/* search the directory for this vnode */
-	ret = afs_do_lookup_one(&dir->netfs.inode, dentry, &fid, key, &dir_version);
+	ret = afs_do_lookup_one(&dir->netfs.inode, name, &fid, &dir_version);
 	switch (ret) {
 	case 0:
 		/* the filename maps to something */
@@ -1198,22 +1157,19 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 		goto out_valid;
 
 	default:
-		_debug("failed to iterate dir %pd: %d",
-		       parent, ret);
+		_debug("failed to iterate parent %pd2: %d", dentry, ret);
 		goto not_found;
 	}
 
 out_valid:
 	dentry->d_fsdata = (void *)(unsigned long)dir_version;
 out_valid_noupdate:
-	dput(parent);
 	key_put(key);
 	_leave(" = 1 [valid]");
 	return 1;
 
 not_found:
 	_debug("dropping dentry %pd2", dentry);
-	dput(parent);
 	key_put(key);
 
 	_leave(" = 0 [bad]");
@@ -1281,6 +1237,7 @@ void afs_check_for_remote_deletion(struct afs_operation *op)
  */
 static void afs_vnode_new_inode(struct afs_operation *op)
 {
+	struct afs_vnode_param *dvp = &op->file[0];
 	struct afs_vnode_param *vp = &op->file[1];
 	struct afs_vnode *vnode;
 	struct inode *inode;
@@ -1300,6 +1257,10 @@ static void afs_vnode_new_inode(struct afs_operation *op)
 
 	vnode = AFS_FS_I(inode);
 	set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
+	if (S_ISDIR(inode->i_mode))
+		afs_mkdir_init_dir(vnode, dvp->vnode);
+	else if (S_ISLNK(inode->i_mode))
+		afs_init_new_symlink(vnode, op);
 	if (!afs_op_error(op))
 		afs_cache_permit(vnode, op->key, vnode->cb_break, &vp->scb);
 	d_instantiate(op->dentry, inode);
@@ -1316,18 +1277,21 @@ static void afs_create_success(struct afs_operation *op)
 
 static void afs_create_edit_dir(struct afs_operation *op)
 {
+	struct netfs_cache_resources cres = {};
 	struct afs_vnode_param *dvp = &op->file[0];
 	struct afs_vnode_param *vp = &op->file[1];
 	struct afs_vnode *dvnode = dvp->vnode;
 
 	_enter("op=%08x", op->debug_id);
 
+	fscache_begin_write_operation(&cres, afs_vnode_cache(dvnode));
 	down_write(&dvnode->validate_lock);
 	if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
 	    dvnode->status.data_version == dvp->dv_before + dvp->dv_delta)
 		afs_edit_dir_add(dvnode, &op->dentry->d_name, &vp->fid,
 				 op->create.reason);
 	up_write(&dvnode->validate_lock);
+	fscache_end_operation(&cres);
 }
 
 static void afs_create_put(struct afs_operation *op)
@@ -1350,11 +1314,12 @@ static const struct afs_operation_ops afs_mkdir_operation = {
 /*
  * create a directory on an AFS filesystem
  */
-static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
-		     struct dentry *dentry, umode_t mode)
+static struct dentry *afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+				struct dentry *dentry, umode_t mode)
 {
 	struct afs_operation *op;
 	struct afs_vnode *dvnode = AFS_FS_I(dir);
+	int ret;
 
 	_enter("{%llx:%llu},{%pd},%ho",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry, mode);
@@ -1362,9 +1327,11 @@ static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 	op = afs_alloc_operation(NULL, dvnode->volume);
 	if (IS_ERR(op)) {
 		d_drop(dentry);
-		return PTR_ERR(op);
+		return ERR_CAST(op);
 	}
 
+	fscache_use_cookie(afs_vnode_cache(dvnode), true);
+
 	afs_op_set_vnode(op, 0, dvnode);
 	op->file[0].dv_delta = 1;
 	op->file[0].modification = true;
@@ -1374,7 +1341,9 @@ static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 	op->create.reason = afs_edit_dir_for_mkdir;
 	op->mtime	= current_time(dir);
 	op->ops		= &afs_mkdir_operation;
-	return afs_do_sync_operation(op);
+	ret = afs_do_sync_operation(op);
+	afs_dir_unuse_cookie(dvnode, ret);
+	return ERR_PTR(ret);
 }
 
 /*
@@ -1387,8 +1356,8 @@ static void afs_dir_remove_subdir(struct dentry *dentry)
 
 		clear_nlink(&vnode->netfs.inode);
 		set_bit(AFS_VNODE_DELETED, &vnode->flags);
-		atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE);
-		clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+		afs_clear_cb_promise(vnode, afs_cb_promise_clear_rmdir);
+		afs_invalidate_dir(vnode, afs_dir_invalid_subdir_removed);
 	}
 }
 
@@ -1402,18 +1371,21 @@ static void afs_rmdir_success(struct afs_operation *op)
 
 static void afs_rmdir_edit_dir(struct afs_operation *op)
 {
+	struct netfs_cache_resources cres = {};
 	struct afs_vnode_param *dvp = &op->file[0];
 	struct afs_vnode *dvnode = dvp->vnode;
 
 	_enter("op=%08x", op->debug_id);
 	afs_dir_remove_subdir(op->dentry);
 
+	fscache_begin_write_operation(&cres, afs_vnode_cache(dvnode));
 	down_write(&dvnode->validate_lock);
 	if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
 	    dvnode->status.data_version == dvp->dv_before + dvp->dv_delta)
 		afs_edit_dir_remove(dvnode, &op->dentry->d_name,
 				    afs_edit_dir_for_rmdir);
 	up_write(&dvnode->validate_lock);
+	fscache_end_operation(&cres);
 }
 
 static void afs_rmdir_put(struct afs_operation *op)
@@ -1448,6 +1420,8 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (IS_ERR(op))
 		return PTR_ERR(op);
 
+	fscache_use_cookie(afs_vnode_cache(dvnode), true);
+
 	afs_op_set_vnode(op, 0, dvnode);
 	op->file[0].dv_delta = 1;
 	op->file[0].modification = true;
@@ -1471,10 +1445,18 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 		op->file[1].vnode = vnode;
 	}
 
-	return afs_do_sync_operation(op);
+	ret = afs_do_sync_operation(op);
+
+	/* Not all systems that can host afs servers have ENOTEMPTY. */
+	if (ret == -EEXIST)
+		ret = -ENOTEMPTY;
+out:
+	afs_dir_unuse_cookie(dvnode, ret);
+	return ret;
 
 error:
-	return afs_put_operation(op);
+	ret = afs_put_operation(op);
+	goto out;
 }
 
 /*
@@ -1537,16 +1519,19 @@ static void afs_unlink_success(struct afs_operation *op)
 
 static void afs_unlink_edit_dir(struct afs_operation *op)
 {
+	struct netfs_cache_resources cres = {};
 	struct afs_vnode_param *dvp = &op->file[0];
 	struct afs_vnode *dvnode = dvp->vnode;
 
 	_enter("op=%08x", op->debug_id);
+	fscache_begin_write_operation(&cres, afs_vnode_cache(dvnode));
 	down_write(&dvnode->validate_lock);
 	if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
 	    dvnode->status.data_version == dvp->dv_before + dvp->dv_delta)
 		afs_edit_dir_remove(dvnode, &op->dentry->d_name,
 				    afs_edit_dir_for_unlink);
 	up_write(&dvnode->validate_lock);
+	fscache_end_operation(&cres);
 }
 
 static void afs_unlink_put(struct afs_operation *op)
@@ -1585,6 +1570,8 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
 	if (IS_ERR(op))
 		return PTR_ERR(op);
 
+	fscache_use_cookie(afs_vnode_cache(dvnode), true);
+
 	afs_op_set_vnode(op, 0, dvnode);
 	op->file[0].dv_delta = 1;
 	op->file[0].modification = true;
@@ -1631,10 +1618,10 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
 		afs_wait_for_operation(op);
 	}
 
-	return afs_put_operation(op);
-
 error:
-	return afs_put_operation(op);
+	ret = afs_put_operation(op);
+	afs_dir_unuse_cookie(dvnode, ret);
+	return ret;
 }
 
 static const struct afs_operation_ops afs_create_operation = {
@@ -1668,6 +1655,8 @@ static int afs_create(struct mnt_idmap *idmap, struct inode *dir,
 		goto error;
 	}
 
+	fscache_use_cookie(afs_vnode_cache(dvnode), true);
+
 	afs_op_set_vnode(op, 0, dvnode);
 	op->file[0].dv_delta = 1;
 	op->file[0].modification = true;
@@ -1678,7 +1667,9 @@ static int afs_create(struct mnt_idmap *idmap, struct inode *dir,
 	op->create.reason = afs_edit_dir_for_create;
 	op->mtime	= current_time(dir);
 	op->ops		= &afs_create_operation;
-	return afs_do_sync_operation(op);
+	ret = afs_do_sync_operation(op);
+	afs_dir_unuse_cookie(dvnode, ret);
+	return ret;
 
 error:
 	d_drop(dentry);
@@ -1743,6 +1734,8 @@ static int afs_link(struct dentry *from, struct inode *dir,
 		goto error;
 	}
 
+	fscache_use_cookie(afs_vnode_cache(dvnode), true);
+
 	ret = afs_validate(vnode, op->key);
 	if (ret < 0)
 		goto error_op;
@@ -1758,10 +1751,13 @@ static int afs_link(struct dentry *from, struct inode *dir,
 	op->dentry_2		= from;
 	op->ops			= &afs_link_operation;
 	op->create.reason	= afs_edit_dir_for_link;
-	return afs_do_sync_operation(op);
+	ret = afs_do_sync_operation(op);
+	afs_dir_unuse_cookie(dvnode, ret);
+	return ret;
 
 error_op:
 	afs_put_operation(op);
+	afs_dir_unuse_cookie(dvnode, ret);
 error:
 	d_drop(dentry);
 	_leave(" = %d", ret);
@@ -1805,6 +1801,8 @@ static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 		goto error;
 	}
 
+	fscache_use_cookie(afs_vnode_cache(dvnode), true);
+
 	afs_op_set_vnode(op, 0, dvnode);
 	op->file[0].dv_delta = 1;
 
@@ -1813,7 +1811,9 @@ static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 	op->create.reason	= afs_edit_dir_for_symlink;
 	op->create.symlink	= content;
 	op->mtime		= current_time(dir);
-	return afs_do_sync_operation(op);
+	ret = afs_do_sync_operation(op);
+	afs_dir_unuse_cookie(dvnode, ret);
+	return ret;
 
 error:
 	d_drop(dentry);
@@ -1823,6 +1823,9 @@ error:
 
 static void afs_rename_success(struct afs_operation *op)
 {
+	struct afs_vnode *vnode = op->more_files[0].vnode;
+	struct afs_vnode *new_vnode = op->more_files[1].vnode;
+
 	_enter("op=%08x", op->debug_id);
 
 	op->ctime = op->file[0].scb.status.mtime_client;
@@ -1832,10 +1835,46 @@ static void afs_rename_success(struct afs_operation *op)
 		op->ctime = op->file[1].scb.status.mtime_client;
 		afs_vnode_commit_status(op, &op->file[1]);
 	}
+	if (op->more_files[0].scb.have_status)
+		afs_vnode_commit_status(op, &op->more_files[0]);
+	if (op->more_files[1].scb.have_status)
+		afs_vnode_commit_status(op, &op->more_files[1]);
+
+	/* If we're moving a subdir between dirs, we need to update
+	 * its DV counter too as the ".." will be altered.
+	 */
+	if (op->file[0].vnode != op->file[1].vnode) {
+		if (S_ISDIR(vnode->netfs.inode.i_mode)) {
+			u64 new_dv;
+
+			write_seqlock(&vnode->cb_lock);
+
+			new_dv = vnode->status.data_version + 1;
+			trace_afs_set_dv(vnode, new_dv);
+			vnode->status.data_version = new_dv;
+			inode_set_iversion_raw(&vnode->netfs.inode, new_dv);
+
+			write_sequnlock(&vnode->cb_lock);
+		}
+
+		if ((op->rename.rename_flags & RENAME_EXCHANGE) &&
+		    S_ISDIR(new_vnode->netfs.inode.i_mode)) {
+			u64 new_dv;
+
+			write_seqlock(&new_vnode->cb_lock);
+
+			new_dv = new_vnode->status.data_version + 1;
+			new_vnode->status.data_version = new_dv;
+			inode_set_iversion_raw(&new_vnode->netfs.inode, new_dv);
+
+			write_sequnlock(&new_vnode->cb_lock);
+		}
+	}
 }
 
 static void afs_rename_edit_dir(struct afs_operation *op)
 {
+	struct netfs_cache_resources orig_cres = {}, new_cres = {};
 	struct afs_vnode_param *orig_dvp = &op->file[0];
 	struct afs_vnode_param *new_dvp = &op->file[1];
 	struct afs_vnode *orig_dvnode = orig_dvp->vnode;
@@ -1852,6 +1891,10 @@ static void afs_rename_edit_dir(struct afs_operation *op)
 		op->rename.rehash = NULL;
 	}
 
+	fscache_begin_write_operation(&orig_cres, afs_vnode_cache(orig_dvnode));
+	if (new_dvnode != orig_dvnode)
+		fscache_begin_write_operation(&new_cres, afs_vnode_cache(new_dvnode));
+
 	down_write(&orig_dvnode->validate_lock);
 	if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) &&
 	    orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta)
@@ -1873,6 +1916,12 @@ static void afs_rename_edit_dir(struct afs_operation *op)
 				 &vnode->fid, afs_edit_dir_for_rename_2);
 	}
 
+	if (S_ISDIR(vnode->netfs.inode.i_mode) &&
+	    new_dvnode != orig_dvnode &&
+	    test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
+		afs_edit_dir_update(vnode, &dotdot_name, new_dvnode,
+				    afs_edit_dir_for_rename_sub);
+
 	new_inode = d_inode(new_dentry);
 	if (new_inode) {
 		spin_lock(&new_inode->i_lock);
@@ -1885,9 +1934,6 @@ static void afs_rename_edit_dir(struct afs_operation *op)
 
 	/* Now we can update d_fsdata on the dentries to reflect their
 	 * new parent's data_version.
-	 *
-	 * Note that if we ever implement RENAME_EXCHANGE, we'll have
-	 * to update both dentries with opposing dir versions.
 	 */
 	afs_update_dentry_version(op, new_dvp, op->dentry);
 	afs_update_dentry_version(op, new_dvp, op->dentry_2);
@@ -1895,6 +1941,70 @@ static void afs_rename_edit_dir(struct afs_operation *op)
 	d_move(old_dentry, new_dentry);
 
 	up_write(&new_dvnode->validate_lock);
+	fscache_end_operation(&orig_cres);
+	if (new_dvnode != orig_dvnode)
+		fscache_end_operation(&new_cres);
+}
+
+static void afs_rename_exchange_edit_dir(struct afs_operation *op)
+{
+	struct afs_vnode_param *orig_dvp = &op->file[0];
+	struct afs_vnode_param *new_dvp = &op->file[1];
+	struct afs_vnode *orig_dvnode = orig_dvp->vnode;
+	struct afs_vnode *new_dvnode = new_dvp->vnode;
+	struct afs_vnode *old_vnode = op->more_files[0].vnode;
+	struct afs_vnode *new_vnode = op->more_files[1].vnode;
+	struct dentry *old_dentry = op->dentry;
+	struct dentry *new_dentry = op->dentry_2;
+
+	_enter("op=%08x", op->debug_id);
+
+	if (new_dvnode == orig_dvnode) {
+		down_write(&orig_dvnode->validate_lock);
+		if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) &&
+		    orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta) {
+			afs_edit_dir_update(orig_dvnode, &old_dentry->d_name,
+					    new_vnode, afs_edit_dir_for_rename_0);
+			afs_edit_dir_update(orig_dvnode, &new_dentry->d_name,
+					    old_vnode, afs_edit_dir_for_rename_1);
+		}
+
+		d_exchange(old_dentry, new_dentry);
+		up_write(&orig_dvnode->validate_lock);
+	} else {
+		down_write(&orig_dvnode->validate_lock);
+		if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) &&
+		    orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta)
+			afs_edit_dir_update(orig_dvnode, &old_dentry->d_name,
+					    new_vnode, afs_edit_dir_for_rename_0);
+
+		up_write(&orig_dvnode->validate_lock);
+		down_write(&new_dvnode->validate_lock);
+
+		if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags) &&
+		    new_dvnode->status.data_version == new_dvp->dv_before + new_dvp->dv_delta)
+			afs_edit_dir_update(new_dvnode, &new_dentry->d_name,
+					    old_vnode, afs_edit_dir_for_rename_1);
+
+		if (S_ISDIR(old_vnode->netfs.inode.i_mode) &&
+		    test_bit(AFS_VNODE_DIR_VALID, &old_vnode->flags))
+			afs_edit_dir_update(old_vnode, &dotdot_name, new_dvnode,
+					    afs_edit_dir_for_rename_sub);
+
+		if (S_ISDIR(new_vnode->netfs.inode.i_mode) &&
+		    test_bit(AFS_VNODE_DIR_VALID, &new_vnode->flags))
+			afs_edit_dir_update(new_vnode, &dotdot_name, orig_dvnode,
+					    afs_edit_dir_for_rename_sub);
+
+		/* Now we can update d_fsdata on the dentries to reflect their
+		 * new parents' data_version.
+		 */
+		afs_update_dentry_version(op, new_dvp, old_dentry);
+		afs_update_dentry_version(op, orig_dvp, new_dentry);
+
+		d_exchange(old_dentry, new_dentry);
+		up_write(&new_dvnode->validate_lock);
+	}
 }
 
 static void afs_rename_put(struct afs_operation *op)
@@ -1915,6 +2025,32 @@ static const struct afs_operation_ops afs_rename_operation = {
 	.put		= afs_rename_put,
 };
 
+#if 0 /* Autoswitched in yfs_fs_rename_replace(). */
+static const struct afs_operation_ops afs_rename_replace_operation = {
+	.issue_afs_rpc	= NULL,
+	.issue_yfs_rpc	= yfs_fs_rename_replace,
+	.success	= afs_rename_success,
+	.edit_dir	= afs_rename_edit_dir,
+	.put		= afs_rename_put,
+};
+#endif
+
+static const struct afs_operation_ops afs_rename_noreplace_operation = {
+	.issue_afs_rpc	= NULL,
+	.issue_yfs_rpc	= yfs_fs_rename_noreplace,
+	.success	= afs_rename_success,
+	.edit_dir	= afs_rename_edit_dir,
+	.put		= afs_rename_put,
+};
+
+static const struct afs_operation_ops afs_rename_exchange_operation = {
+	.issue_afs_rpc	= NULL,
+	.issue_yfs_rpc	= yfs_fs_rename_exchange,
+	.success	= afs_rename_success,
+	.edit_dir	= afs_rename_exchange_edit_dir,
+	.put		= afs_rename_put,
+};
+
 /*
  * rename a file in an AFS filesystem and/or move it between directories
  */
@@ -1923,10 +2059,10 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 		      struct dentry *new_dentry, unsigned int flags)
 {
 	struct afs_operation *op;
-	struct afs_vnode *orig_dvnode, *new_dvnode, *vnode;
+	struct afs_vnode *orig_dvnode, *new_dvnode, *vnode, *new_vnode = NULL;
 	int ret;
 
-	if (flags)
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
 		return -EINVAL;
 
 	/* Don't allow silly-rename files be moved around. */
@@ -1936,6 +2072,8 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 	vnode = AFS_FS_I(d_inode(old_dentry));
 	orig_dvnode = AFS_FS_I(old_dir);
 	new_dvnode = AFS_FS_I(new_dir);
+	if (d_is_positive(new_dentry))
+		new_vnode = AFS_FS_I(d_inode(new_dentry));
 
 	_enter("{%llx:%llu},{%llx:%llu},{%llx:%llu},{%pd}",
 	       orig_dvnode->fid.vid, orig_dvnode->fid.vnode,
@@ -1947,11 +2085,20 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 	if (IS_ERR(op))
 		return PTR_ERR(op);
 
+	fscache_use_cookie(afs_vnode_cache(orig_dvnode), true);
+	if (new_dvnode != orig_dvnode)
+		fscache_use_cookie(afs_vnode_cache(new_dvnode), true);
+
 	ret = afs_validate(vnode, op->key);
 	afs_op_set_error(op, ret);
 	if (ret < 0)
 		goto error;
 
+	ret = -ENOMEM;
+	op->more_files = kvcalloc(2, sizeof(struct afs_vnode_param), GFP_KERNEL);
+	if (!op->more_files)
+		goto error;
+
 	afs_op_set_vnode(op, 0, orig_dvnode);
 	afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */
 	op->file[0].dv_delta = 1;
@@ -1960,46 +2107,63 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 	op->file[1].modification = true;
 	op->file[0].update_ctime = true;
 	op->file[1].update_ctime = true;
+	op->more_files[0].vnode		= vnode;
+	op->more_files[0].speculative	= true;
+	op->more_files[1].vnode		= new_vnode;
+	op->more_files[1].speculative	= true;
+	op->nr_files = 4;
 
 	op->dentry		= old_dentry;
 	op->dentry_2		= new_dentry;
+	op->rename.rename_flags	= flags;
 	op->rename.new_negative	= d_is_negative(new_dentry);
-	op->ops			= &afs_rename_operation;
 
-	/* For non-directories, check whether the target is busy and if so,
-	 * make a copy of the dentry and then do a silly-rename.  If the
-	 * silly-rename succeeds, the copied dentry is hashed and becomes the
-	 * new target.
-	 */
-	if (d_is_positive(new_dentry) && !d_is_dir(new_dentry)) {
-		/* To prevent any new references to the target during the
-		 * rename, we unhash the dentry in advance.
+	if (flags & RENAME_NOREPLACE) {
+		op->ops		= &afs_rename_noreplace_operation;
+	} else if (flags & RENAME_EXCHANGE) {
+		op->ops		= &afs_rename_exchange_operation;
+		d_drop(new_dentry);
+	} else {
+		/* If we might displace the target, we might need to do silly
+		 * rename.
 		 */
-		if (!d_unhashed(new_dentry)) {
-			d_drop(new_dentry);
-			op->rename.rehash = new_dentry;
-		}
+		op->ops	= &afs_rename_operation;
 
-		if (d_count(new_dentry) > 2) {
-			/* copy the target dentry's name */
-			op->rename.tmp = d_alloc(new_dentry->d_parent,
-						 &new_dentry->d_name);
-			if (!op->rename.tmp) {
-				afs_op_nomem(op);
-				goto error;
+		/* For non-directories, check whether the target is busy and if
+		 * so, make a copy of the dentry and then do a silly-rename.
+		 * If the silly-rename succeeds, the copied dentry is hashed
+		 * and becomes the new target.
+		 */
+		if (d_is_positive(new_dentry) && !d_is_dir(new_dentry)) {
+			/* To prevent any new references to the target during
+			 * the rename, we unhash the dentry in advance.
+			 */
+			if (!d_unhashed(new_dentry)) {
+				d_drop(new_dentry);
+				op->rename.rehash = new_dentry;
 			}
 
-			ret = afs_sillyrename(new_dvnode,
-					      AFS_FS_I(d_inode(new_dentry)),
-					      new_dentry, op->key);
-			if (ret) {
-				afs_op_set_error(op, ret);
-				goto error;
+			if (d_count(new_dentry) > 2) {
+				/* copy the target dentry's name */
+				op->rename.tmp = d_alloc(new_dentry->d_parent,
+							 &new_dentry->d_name);
+				if (!op->rename.tmp) {
+					afs_op_nomem(op);
+					goto error;
+				}
+
+				ret = afs_sillyrename(new_dvnode,
+						      AFS_FS_I(d_inode(new_dentry)),
+						      new_dentry, op->key);
+				if (ret) {
+					afs_op_set_error(op, ret);
+					goto error;
+				}
+
+				op->dentry_2 = op->rename.tmp;
+				op->rename.rehash = NULL;
+				op->rename.new_negative = true;
 			}
-
-			op->dentry_2 = op->rename.tmp;
-			op->rename.rehash = NULL;
-			op->rename.new_negative = true;
 		}
 	}
 
@@ -2014,47 +2178,45 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 	 */
 	d_drop(old_dentry);
 
-	return afs_do_sync_operation(op);
+	ret = afs_do_sync_operation(op);
+	if (ret == -ENOTSUPP)
+		ret = -EINVAL;
+out:
+	afs_dir_unuse_cookie(orig_dvnode, ret);
+	if (new_dvnode != orig_dvnode)
+		afs_dir_unuse_cookie(new_dvnode, ret);
+	return ret;
 
 error:
-	return afs_put_operation(op);
-}
-
-/*
- * Release a directory folio and clean up its private state if it's not busy
- * - return true if the folio can now be released, false if not
- */
-static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags)
-{
-	struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio));
-
-	_enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio->index);
-
-	folio_detach_private(folio);
-
-	/* The directory will need reloading. */
-	if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
-		afs_stat_v(dvnode, n_relpg);
-	return true;
+	ret = afs_put_operation(op);
+	goto out;
 }
 
 /*
- * Invalidate part or all of a folio.
+ * Write the file contents to the cache as a single blob.
  */
-static void afs_dir_invalidate_folio(struct folio *folio, size_t offset,
-				   size_t length)
+int afs_single_writepages(struct address_space *mapping,
+			  struct writeback_control *wbc)
 {
-	struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio));
-
-	_enter("{%lu},%zu,%zu", folio->index, offset, length);
-
-	BUG_ON(!folio_test_locked(folio));
+	struct afs_vnode *dvnode = AFS_FS_I(mapping->host);
+	struct iov_iter iter;
+	bool is_dir = (S_ISDIR(dvnode->netfs.inode.i_mode) &&
+		       !test_bit(AFS_VNODE_MOUNTPOINT, &dvnode->flags));
+	int ret = 0;
 
-	/* The directory will need reloading. */
-	if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
-		afs_stat_v(dvnode, n_inval);
+	/* Need to lock to prevent the folio queue and folios from being thrown
+	 * away.
+	 */
+	down_read(&dvnode->validate_lock);
+
+	if (is_dir ?
+	    test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) :
+	    atomic64_read(&dvnode->cb_expires_at) != AFS_NO_CB_PROMISE) {
+		iov_iter_folio_queue(&iter, ITER_SOURCE, dvnode->directory, 0, 0,
+				     i_size_read(&dvnode->netfs.inode));
+		ret = netfs_writeback_single(mapping, wbc, &iter);
+	}
 
-	/* we clean up only if the entire folio is being invalidated */
-	if (offset == 0 && length == folio_size(folio))
-		folio_detach_private(folio);
+	up_read(&dvnode->validate_lock);
+	return ret;
 }
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index e2fa577b66fe..fd3aa9f97ce6 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -10,6 +10,7 @@
 #include <linux/namei.h>
 #include <linux/pagemap.h>
 #include <linux/iversion.h>
+#include <linux/folio_queue.h>
 #include "internal.h"
 #include "xdr_fs.h"
 
@@ -105,32 +106,66 @@ static void afs_clear_contig_bits(union afs_xdr_dir_block *block,
 }
 
 /*
- * Get a new directory folio.
+ * Get a specific block, extending the directory storage to cover it as needed.
  */
-static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index)
+static union afs_xdr_dir_block *afs_dir_get_block(struct afs_dir_iter *iter, size_t block)
 {
-	struct address_space *mapping = vnode->netfs.inode.i_mapping;
+	struct folio_queue *fq;
+	struct afs_vnode *dvnode = iter->dvnode;
 	struct folio *folio;
+	size_t blpos = block * AFS_DIR_BLOCK_SIZE;
+	size_t blend = (block + 1) * AFS_DIR_BLOCK_SIZE, fpos = iter->fpos;
+	int ret;
+
+	if (dvnode->directory_size < blend) {
+		size_t cur_size = dvnode->directory_size;
+
+		ret = netfs_alloc_folioq_buffer(
+			NULL, &dvnode->directory, &cur_size, blend,
+			mapping_gfp_mask(dvnode->netfs.inode.i_mapping));
+		dvnode->directory_size = cur_size;
+		if (ret < 0)
+			goto fail;
+	}
 
-	folio = __filemap_get_folio(mapping, index,
-				    FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
-				    mapping->gfp_mask);
-	if (IS_ERR(folio)) {
-		clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
-		return NULL;
+	fq = iter->fq;
+	if (!fq)
+		fq = dvnode->directory;
+
+	/* Search the folio queue for the folio containing the block... */
+	for (; fq; fq = fq->next) {
+		for (int s = iter->fq_slot; s < folioq_count(fq); s++) {
+			size_t fsize = folioq_folio_size(fq, s);
+
+			if (blend <= fpos + fsize) {
+				/* ... and then return the mapped block. */
+				folio = folioq_folio(fq, s);
+				if (WARN_ON_ONCE(folio_pos(folio) != fpos))
+					goto fail;
+				iter->fq = fq;
+				iter->fq_slot = s;
+				iter->fpos = fpos;
+				return kmap_local_folio(folio, blpos - fpos);
+			}
+			fpos += fsize;
+		}
+		iter->fq_slot = 0;
 	}
-	if (!folio_test_private(folio))
-		folio_attach_private(folio, (void *)1);
-	return folio;
+
+fail:
+	iter->fq = NULL;
+	iter->fq_slot = 0;
+	afs_invalidate_dir(dvnode, afs_dir_invalid_edit_get_block);
+	return NULL;
 }
 
 /*
  * Scan a directory block looking for a dirent of the right name.
  */
-static int afs_dir_scan_block(union afs_xdr_dir_block *block, struct qstr *name,
+static int afs_dir_scan_block(const union afs_xdr_dir_block *block, const struct qstr *name,
 			      unsigned int blocknum)
 {
-	union afs_xdr_dirent *de;
+	const union afs_xdr_dirent *de;
 	u64 bitmap;
 	int d, len, n;
 
@@ -204,14 +239,13 @@ static void afs_edit_init_block(union afs_xdr_dir_block *meta,
  * The caller must hold the inode locked.
  */
 void afs_edit_dir_add(struct afs_vnode *vnode,
-		      struct qstr *name, struct afs_fid *new_fid,
+		      const struct qstr *name, struct afs_fid *new_fid,
 		      enum afs_edit_dir_reason why)
 {
 	union afs_xdr_dir_block *meta, *block;
 	union afs_xdr_dirent *de;
-	struct folio *folio0, *folio;
-	unsigned int need_slots, nr_blocks, b;
-	pgoff_t index;
+	struct afs_dir_iter iter = { .dvnode = vnode };
+	unsigned int nr_blocks, b, entry;
 	loff_t i_size;
 	int slot;
 
@@ -220,20 +254,17 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
 	i_size = i_size_read(&vnode->netfs.inode);
 	if (i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS ||
 	    (i_size & (AFS_DIR_BLOCK_SIZE - 1))) {
-		clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+		afs_invalidate_dir(vnode, afs_dir_invalid_edit_add_bad_size);
 		return;
 	}
 
-	folio0 = afs_dir_get_folio(vnode, 0);
-	if (!folio0) {
-		_leave(" [fgp]");
+	meta = afs_dir_get_block(&iter, 0);
+	if (!meta)
 		return;
-	}
 
 	/* Work out how many slots we're going to need. */
-	need_slots = afs_dir_calc_slots(name->len);
+	iter.nr_slots = afs_dir_calc_slots(name->len);
 
-	meta = kmap_local_folio(folio0, 0);
 	if (i_size == 0)
 		goto new_directory;
 	nr_blocks = i_size / AFS_DIR_BLOCK_SIZE;
@@ -245,22 +276,21 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
 		/* If the directory extended into a new folio, then we need to
 		 * tack a new folio on the end.
 		 */
-		index = b / AFS_DIR_BLOCKS_PER_PAGE;
 		if (nr_blocks >= AFS_DIR_MAX_BLOCKS)
-			goto error;
-		if (index >= folio_nr_pages(folio0)) {
-			folio = afs_dir_get_folio(vnode, index);
-			if (!folio)
-				goto error;
-		} else {
-			folio = folio0;
-		}
+			goto error_too_many_blocks;
 
-		block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_file_pos(folio));
+		/* Lower dir blocks have a counter in the header we can check. */
+		if (b < AFS_DIR_BLOCKS_WITH_CTR &&
+		    meta->meta.alloc_ctrs[b] < iter.nr_slots)
+			continue;
+
+		block = afs_dir_get_block(&iter, b);
+		if (!block)
+			goto error;
 
 		/* Abandon the edit if we got a callback break. */
 		if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
-			goto invalidated;
+			goto already_invalidated;
 
 		_debug("block %u: %2u %3u %u",
 		       b,
@@ -275,31 +305,23 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
 			afs_set_i_size(vnode, (b + 1) * AFS_DIR_BLOCK_SIZE);
 		}
 
-		/* Only lower dir blocks have a counter in the header. */
-		if (b >= AFS_DIR_BLOCKS_WITH_CTR ||
-		    meta->meta.alloc_ctrs[b] >= need_slots) {
-			/* We need to try and find one or more consecutive
-			 * slots to hold the entry.
-			 */
-			slot = afs_find_contig_bits(block, need_slots);
-			if (slot >= 0) {
-				_debug("slot %u", slot);
-				goto found_space;
-			}
+		/* We need to try and find one or more consecutive slots to
+		 * hold the entry.
+		 */
+		slot = afs_find_contig_bits(block, iter.nr_slots);
+		if (slot >= 0) {
+			_debug("slot %u", slot);
+			goto found_space;
 		}
 
 		kunmap_local(block);
-		if (folio != folio0) {
-			folio_unlock(folio);
-			folio_put(folio);
-		}
 	}
 
 	/* There are no spare slots of sufficient size, yet the operation
 	 * succeeded.  Download the directory again.
 	 */
 	trace_afs_edit_dir(vnode, why, afs_edit_dir_create_nospc, 0, 0, 0, 0, name->name);
-	clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+	afs_invalidate_dir(vnode, afs_dir_invalid_edit_add_no_slots);
 	goto out_unmap;
 
 new_directory:
@@ -307,8 +329,7 @@ new_directory:
 	i_size = AFS_DIR_BLOCK_SIZE;
 	afs_set_i_size(vnode, i_size);
 	slot = AFS_DIR_RESV_BLOCKS0;
-	folio = folio0;
-	block = kmap_local_folio(folio, 0);
+	block = afs_dir_get_block(&iter, 0);
 	nr_blocks = 1;
 	b = 0;
 
@@ -326,41 +347,39 @@ found_space:
 	de->u.name[name->len] = 0;
 
 	/* Adjust the bitmap. */
-	afs_set_contig_bits(block, slot, need_slots);
-	kunmap_local(block);
-	if (folio != folio0) {
-		folio_unlock(folio);
-		folio_put(folio);
-	}
+	afs_set_contig_bits(block, slot, iter.nr_slots);
 
 	/* Adjust the allocation counter. */
 	if (b < AFS_DIR_BLOCKS_WITH_CTR)
-		meta->meta.alloc_ctrs[b] -= need_slots;
+		meta->meta.alloc_ctrs[b] -= iter.nr_slots;
+
+	/* Adjust the hash chain. */
+	entry = b * AFS_DIR_SLOTS_PER_BLOCK + slot;
+	iter.bucket = afs_dir_hash_name(name);
+	de->u.hash_next = meta->meta.hashtable[iter.bucket];
+	meta->meta.hashtable[iter.bucket] = htons(entry);
+	kunmap_local(block);
 
 	inode_inc_iversion_raw(&vnode->netfs.inode);
 	afs_stat_v(vnode, n_dir_cr);
 	_debug("Insert %s in %u[%u]", name->name, b, slot);
 
+	netfs_single_mark_inode_dirty(&vnode->netfs.inode);
+
 out_unmap:
 	kunmap_local(meta);
-	folio_unlock(folio0);
-	folio_put(folio0);
 	_leave("");
 	return;
 
-invalidated:
+already_invalidated:
 	trace_afs_edit_dir(vnode, why, afs_edit_dir_create_inval, 0, 0, 0, 0, name->name);
-	clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
 	kunmap_local(block);
-	if (folio != folio0) {
-		folio_unlock(folio);
-		folio_put(folio);
-	}
 	goto out_unmap;
 
+error_too_many_blocks:
+	afs_invalidate_dir(vnode, afs_dir_invalid_edit_add_too_many_blocks);
 error:
 	trace_afs_edit_dir(vnode, why, afs_edit_dir_create_error, 0, 0, 0, 0, name->name);
-	clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
 	goto out_unmap;
 }
 
@@ -372,15 +391,16 @@ error:
  * The caller must hold the inode locked.
  */
 void afs_edit_dir_remove(struct afs_vnode *vnode,
-			 struct qstr *name, enum afs_edit_dir_reason why)
+			 const struct qstr *name, enum afs_edit_dir_reason why)
 {
-	union afs_xdr_dir_block *meta, *block;
-	union afs_xdr_dirent *de;
-	struct folio *folio0, *folio;
-	unsigned int need_slots, nr_blocks, b;
-	pgoff_t index;
+	union afs_xdr_dir_block *meta, *block, *pblock;
+	union afs_xdr_dirent *de, *pde;
+	struct afs_dir_iter iter = { .dvnode = vnode };
+	struct afs_fid fid;
+	unsigned int b, slot, entry;
 	loff_t i_size;
-	int slot;
+	__be16 next;
+	int found;
 
 	_enter(",,{%d,%s},", name->len, name->name);
 
@@ -388,81 +408,95 @@ void afs_edit_dir_remove(struct afs_vnode *vnode,
 	if (i_size < AFS_DIR_BLOCK_SIZE ||
 	    i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS ||
 	    (i_size & (AFS_DIR_BLOCK_SIZE - 1))) {
-		clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+		afs_invalidate_dir(vnode, afs_dir_invalid_edit_rem_bad_size);
 		return;
 	}
-	nr_blocks = i_size / AFS_DIR_BLOCK_SIZE;
 
-	folio0 = afs_dir_get_folio(vnode, 0);
-	if (!folio0) {
-		_leave(" [fgp]");
+	if (!afs_dir_init_iter(&iter, name))
 		return;
-	}
-
-	/* Work out how many slots we're going to discard. */
-	need_slots = afs_dir_calc_slots(name->len);
-
-	meta = kmap_local_folio(folio0, 0);
-
-	/* Find a block that has sufficient slots available.  Each folio
-	 * contains two or more directory blocks.
-	 */
-	for (b = 0; b < nr_blocks; b++) {
-		index = b / AFS_DIR_BLOCKS_PER_PAGE;
-		if (index >= folio_nr_pages(folio0)) {
-			folio = afs_dir_get_folio(vnode, index);
-			if (!folio)
-				goto error;
-		} else {
-			folio = folio0;
-		}
 
-		block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_file_pos(folio));
-
-		/* Abandon the edit if we got a callback break. */
-		if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
-			goto invalidated;
-
-		if (b > AFS_DIR_BLOCKS_WITH_CTR ||
-		    meta->meta.alloc_ctrs[b] <= AFS_DIR_SLOTS_PER_BLOCK - 1 - need_slots) {
-			slot = afs_dir_scan_block(block, name, b);
-			if (slot >= 0)
-				goto found_dirent;
-		}
+	meta = afs_dir_find_block(&iter, 0);
+	if (!meta)
+		return;
 
-		kunmap_local(block);
-		if (folio != folio0) {
-			folio_unlock(folio);
-			folio_put(folio);
-		}
+	/* Find the entry in the blob. */
+	found = afs_dir_search_bucket(&iter, name, &fid);
+	if (found < 0) {
+		/* Didn't find the dirent to clobber.  Re-download. */
+		trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_noent,
+				   0, 0, 0, 0, name->name);
+		afs_invalidate_dir(vnode, afs_dir_invalid_edit_rem_wrong_name);
+		goto out_unmap;
 	}
 
-	/* Didn't find the dirent to clobber.  Download the directory again. */
-	trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_noent,
-			   0, 0, 0, 0, name->name);
-	clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
-	goto out_unmap;
+	entry = found;
+	b    = entry / AFS_DIR_SLOTS_PER_BLOCK;
+	slot = entry % AFS_DIR_SLOTS_PER_BLOCK;
 
-found_dirent:
+	block = afs_dir_find_block(&iter, b);
+	if (!block)
+		goto error;
+	if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
+		goto already_invalidated;
+
+	/* Check and clear the entry. */
 	de = &block->dirents[slot];
+	if (de->u.valid != 1)
+		goto error_unmap;
 
 	trace_afs_edit_dir(vnode, why, afs_edit_dir_delete, b, slot,
 			   ntohl(de->u.vnode), ntohl(de->u.unique),
 			   name->name);
 
-	memset(de, 0, sizeof(*de) * need_slots);
-
 	/* Adjust the bitmap. */
-	afs_clear_contig_bits(block, slot, need_slots);
-	kunmap_local(block);
-	if (folio != folio0) {
-		folio_unlock(folio);
-		folio_put(folio);
-	}
+	afs_clear_contig_bits(block, slot, iter.nr_slots);
 
 	/* Adjust the allocation counter. */
 	if (b < AFS_DIR_BLOCKS_WITH_CTR)
-		meta->meta.alloc_ctrs[b] += need_slots;
+		meta->meta.alloc_ctrs[b] += iter.nr_slots;
+
+	/* Clear the constituent entries. */
+	next = de->u.hash_next;
+	memset(de, 0, sizeof(*de) * iter.nr_slots);
+	kunmap_local(block);
+
+	/* Adjust the hash chain: if iter.prev_entry is 0, the hashtable head
+	 * points at this entry; otherwise it is the previous entry's index.
+	 */
+	if (!iter.prev_entry) {
+		__be16 prev_next = meta->meta.hashtable[iter.bucket];
+
+		if (unlikely(prev_next != htons(entry))) {
+			pr_warn("%llx:%llx:%x: not head of chain b=%x p=%x,%x e=%x %*s",
+				vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique,
+				iter.bucket, iter.prev_entry, prev_next, entry,
+				name->len, name->name);
+			goto error;
+		}
+		meta->meta.hashtable[iter.bucket] = next;
+	} else {
+		unsigned int pb = iter.prev_entry / AFS_DIR_SLOTS_PER_BLOCK;
+		unsigned int ps = iter.prev_entry % AFS_DIR_SLOTS_PER_BLOCK;
+		__be16 prev_next;
+
+		pblock = afs_dir_find_block(&iter, pb);
+		if (!pblock)
+			goto error;
+		pde = &pblock->dirents[ps];
+		prev_next = pde->u.hash_next;
+		if (prev_next != htons(entry)) {
+			kunmap_local(pblock);
+			pr_warn("%llx:%llx:%x: not prev in chain b=%x p=%x,%x e=%x %*s",
+				vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique,
+				iter.bucket, iter.prev_entry, prev_next, entry,
+				name->len, name->name);
+			goto error;
+		}
+		pde->u.hash_next = next;
+		kunmap_local(pblock);
+	}
+
+	netfs_single_mark_inode_dirty(&vnode->netfs.inode);
 
 	inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version);
 	afs_stat_v(vnode, n_dir_rm);
@@ -470,25 +504,145 @@ found_dirent:
 
 out_unmap:
 	kunmap_local(meta);
-	folio_unlock(folio0);
-	folio_put(folio0);
 	_leave("");
 	return;
 
-invalidated:
+already_invalidated:
+	kunmap_local(block);
 	trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_inval,
 			   0, 0, 0, 0, name->name);
-	clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
-	kunmap_local(block);
-	if (folio != folio0) {
-		folio_unlock(folio);
-		folio_put(folio);
-	}
 	goto out_unmap;
 
+error_unmap:
+	kunmap_local(block);
 error:
 	trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_error,
 			   0, 0, 0, 0, name->name);
-	clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
 	goto out_unmap;
 }
+
+/*
+ * Edit an entry in a directory to update the vnode it refers to.  This is also
+ * used to update the ".." entry in a directory.
+ */
+void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name,
+			 struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why)
+{
+	union afs_xdr_dir_block *block;
+	union afs_xdr_dirent *de;
+	struct afs_dir_iter iter = { .dvnode = vnode };
+	unsigned int nr_blocks, b;
+	loff_t i_size;
+	int slot;
+
+	_enter("");
+
+	i_size = i_size_read(&vnode->netfs.inode);
+	if (i_size < AFS_DIR_BLOCK_SIZE) {
+		afs_invalidate_dir(vnode, afs_dir_invalid_edit_upd_bad_size);
+		return;
+	}
+
+	nr_blocks = i_size / AFS_DIR_BLOCK_SIZE;
+
+	/* Scan the blocks in turn for the named dirent.  Each folio
+	 * contains two or more directory blocks.
+	 */
+	for (b = 0; b < nr_blocks; b++) {
+		block = afs_dir_get_block(&iter, b);
+		if (!block)
+			goto error;
+
+		/* Abandon the edit if we got a callback break. */
+		if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
+			goto already_invalidated;
+
+		slot = afs_dir_scan_block(block, name, b);
+		if (slot >= 0)
+			goto found_dirent;
+
+		kunmap_local(block);
+	}
+
+	/* Didn't find the dirent to clobber.  Download the directory again. */
+	trace_afs_edit_dir(vnode, why, afs_edit_dir_update_nodd,
+			   0, 0, 0, 0, name->name);
+	afs_invalidate_dir(vnode, afs_dir_invalid_edit_upd_no_dd);
+	goto out;
+
+found_dirent:
+	de = &block->dirents[slot];
+	de->u.vnode  = htonl(new_dvnode->fid.vnode);
+	de->u.unique = htonl(new_dvnode->fid.unique);
+
+	trace_afs_edit_dir(vnode, why, afs_edit_dir_update_dd, b, slot,
+			   ntohl(de->u.vnode), ntohl(de->u.unique), name->name);
+
+	kunmap_local(block);
+	netfs_single_mark_inode_dirty(&vnode->netfs.inode);
+	inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version);
+
+out:
+	_leave("");
+	return;
+
+already_invalidated:
+	kunmap_local(block);
+	trace_afs_edit_dir(vnode, why, afs_edit_dir_update_inval,
+			   0, 0, 0, 0, name->name);
+	goto out;
+
+error:
+	trace_afs_edit_dir(vnode, why, afs_edit_dir_update_error,
+			   0, 0, 0, 0, name->name);
+	goto out;
+}
+
+/*
+ * Initialise a new directory.  We need to fill in the "." and ".." entries.
+ */
+void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_dvnode)
+{
+	union afs_xdr_dir_block *meta;
+	struct afs_dir_iter iter = { .dvnode = dvnode };
+	union afs_xdr_dirent *de;
+	unsigned int slot = AFS_DIR_RESV_BLOCKS0;
+	loff_t i_size;
+
+	i_size = i_size_read(&dvnode->netfs.inode);
+	if (i_size != AFS_DIR_BLOCK_SIZE) {
+		afs_invalidate_dir(dvnode, afs_dir_invalid_edit_add_bad_size);
+		return;
+	}
+
+	meta = afs_dir_get_block(&iter, 0);
+	if (!meta)
+		return;
+
+	afs_edit_init_block(meta, meta, 0);
+
+	de = &meta->dirents[slot];
+	de->u.valid  = 1;
+	de->u.vnode  = htonl(dvnode->fid.vnode);
+	de->u.unique = htonl(dvnode->fid.unique);
+	memcpy(de->u.name, ".", 2);
+	trace_afs_edit_dir(dvnode, afs_edit_dir_for_mkdir, afs_edit_dir_mkdir, 0, slot,
+			   dvnode->fid.vnode, dvnode->fid.unique, ".");
+	slot++;
+
+	de = &meta->dirents[slot];
+	de->u.valid  = 1;
+	de->u.vnode  = htonl(parent_dvnode->fid.vnode);
+	de->u.unique = htonl(parent_dvnode->fid.unique);
+	memcpy(de->u.name, "..", 3);
+	trace_afs_edit_dir(dvnode, afs_edit_dir_for_mkdir, afs_edit_dir_mkdir, 0, slot,
+			   parent_dvnode->fid.vnode, parent_dvnode->fid.unique, "..");
+
+	afs_set_contig_bits(meta, AFS_DIR_RESV_BLOCKS0, 2);
+	meta->meta.alloc_ctrs[0] -= 2;
+	kunmap_local(meta);
+
+	netfs_single_mark_inode_dirty(&dvnode->netfs.inode);
+	set_bit(AFS_VNODE_DIR_VALID, &dvnode->flags);
+	set_bit(AFS_VNODE_DIR_READ, &dvnode->flags);
+}
diff --git a/fs/afs/dir_search.c b/fs/afs/dir_search.c
new file mode 100644
index 000000000000..d2516e55b5ed
--- /dev/null
+++ b/fs/afs/dir_search.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Search a directory's hash table.
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * https://tools.ietf.org/html/draft-keiser-afs3-directory-object-00
+ */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/iversion.h>
+#include "internal.h"
+#include "afs_fs.h"
+#include "xdr_fs.h"
+
+/*
+ * Calculate the name hash.
+ */
+unsigned int afs_dir_hash_name(const struct qstr *name)
+{
+	const unsigned char *p = name->name;
+	unsigned int hash = 0, i;
+	int bucket;
+
+	for (i = 0; i < name->len; i++)
+		hash = (hash * 173) + p[i];
+	bucket = hash & (AFS_DIR_HASHTBL_SIZE - 1);
+	if (hash > INT_MAX) {
+		bucket = AFS_DIR_HASHTBL_SIZE - bucket;
+		bucket &= (AFS_DIR_HASHTBL_SIZE - 1);
+	}
+	return bucket;
+}
+
+/*
+ * Reset a directory iterator.
+ */
+static bool afs_dir_reset_iter(struct afs_dir_iter *iter)
+{
+	unsigned long long i_size = i_size_read(&iter->dvnode->netfs.inode);
+	unsigned int nblocks;
+
+	/* Work out the maximum number of steps we can take. */
+	nblocks = umin(i_size / AFS_DIR_BLOCK_SIZE, AFS_DIR_MAX_BLOCKS);
+	if (!nblocks)
+		return false;
+	iter->loop_check = nblocks * (AFS_DIR_SLOTS_PER_BLOCK - AFS_DIR_RESV_BLOCKS);
+	iter->prev_entry = 0; /* Hash head is previous */
+	return true;
+}
+
+/*
+ * Initialise a directory iterator for looking up a name.
+ */
+bool afs_dir_init_iter(struct afs_dir_iter *iter, const struct qstr *name)
+{
+	iter->nr_slots = afs_dir_calc_slots(name->len);
+	iter->bucket = afs_dir_hash_name(name);
+	return afs_dir_reset_iter(iter);
+}
+
+/*
+ * Get a specific block.
+ */
+union afs_xdr_dir_block *afs_dir_find_block(struct afs_dir_iter *iter, size_t block)
+{
+	struct folio_queue *fq = iter->fq;
+	struct afs_vnode *dvnode = iter->dvnode;
+	struct folio *folio;
+	size_t blpos = block * AFS_DIR_BLOCK_SIZE;
+	size_t blend = (block + 1) * AFS_DIR_BLOCK_SIZE, fpos = iter->fpos;
+	int slot = iter->fq_slot;
+
+	_enter("%zx,%d", block, slot);
+
+	if (iter->block) {
+		kunmap_local(iter->block);
+		iter->block = NULL;
+	}
+
+	if (dvnode->directory_size < blend)
+		goto fail;
+
+	if (!fq || blpos < fpos) {
+		fq = dvnode->directory;
+		slot = 0;
+		fpos = 0;
+	}
+
+	/* Search the folio queue for the folio containing the block... */
+	for (; fq; fq = fq->next) {
+		for (; slot < folioq_count(fq); slot++) {
+			size_t fsize = folioq_folio_size(fq, slot);
+
+			if (blend <= fpos + fsize) {
+				/* ... and then return the mapped block. */
+				folio = folioq_folio(fq, slot);
+				if (WARN_ON_ONCE(folio_pos(folio) != fpos))
+					goto fail;
+				iter->fq = fq;
+				iter->fq_slot = slot;
+				iter->fpos = fpos;
+				iter->block = kmap_local_folio(folio, blpos - fpos);
+				return iter->block;
+			}
+			fpos += fsize;
+		}
+		slot = 0;
+	}
+
+fail:
+	iter->fq = NULL;
+	iter->fq_slot = 0;
+	afs_invalidate_dir(dvnode, afs_dir_invalid_edit_get_block);
+	return NULL;
+}
+
+/*
+ * Search through a directory bucket.
+ */
+int afs_dir_search_bucket(struct afs_dir_iter *iter, const struct qstr *name,
+			  struct afs_fid *_fid)
+{
+	const union afs_xdr_dir_block *meta;
+	unsigned int entry;
+	int ret = -ESTALE;
+
+	meta = afs_dir_find_block(iter, 0);
+	if (!meta)
+		return -ESTALE;
+
+	entry = ntohs(meta->meta.hashtable[iter->bucket & (AFS_DIR_HASHTBL_SIZE - 1)]);
+	_enter("%x,%x", iter->bucket, entry);
+
+	while (entry) {
+		const union afs_xdr_dir_block *block;
+		const union afs_xdr_dirent *dire;
+		unsigned int blnum = entry / AFS_DIR_SLOTS_PER_BLOCK;
+		unsigned int slot = entry % AFS_DIR_SLOTS_PER_BLOCK;
+		unsigned int resv = (blnum == 0 ? AFS_DIR_RESV_BLOCKS0 : AFS_DIR_RESV_BLOCKS);
+
+		_debug("search %x", entry);
+
+		if (slot < resv) {
+			kdebug("slot out of range h=%x rs=%2x sl=%2x-%2x",
+			       iter->bucket, resv, slot, slot + iter->nr_slots - 1);
+			goto bad;
+		}
+
+		block = afs_dir_find_block(iter, blnum);
+		if (!block)
+			goto bad;
+		dire = &block->dirents[slot];
+
+		if (slot + iter->nr_slots <= AFS_DIR_SLOTS_PER_BLOCK &&
+		    memcmp(dire->u.name, name->name, name->len) == 0 &&
+		    dire->u.name[name->len] == '\0') {
+			_fid->vnode  = ntohl(dire->u.vnode);
+			_fid->unique = ntohl(dire->u.unique);
+			ret = entry;
+			goto found;
+		}
+
+		iter->prev_entry = entry;
+		entry = ntohs(dire->u.hash_next);
+		if (!--iter->loop_check) {
+			kdebug("dir chain loop h=%x", iter->bucket);
+			goto bad;
+		}
+	}
+
+	ret = -ENOENT;
+found:
+	if (iter->block) {
+		kunmap_local(iter->block);
+		iter->block = NULL;
+	}
+
+bad:
+	if (ret == -ESTALE)
+		afs_invalidate_dir(iter->dvnode, afs_dir_invalid_iter_stale);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * Search the appropriate hash chain in the contents of an AFS directory.
+ */
+int afs_dir_search(struct afs_vnode *dvnode, const struct qstr *name,
+		   struct afs_fid *_fid, afs_dataversion_t *_dir_version)
+{
+	struct afs_dir_iter iter = { .dvnode = dvnode, };
+	int ret, retry_limit = 3;
+
+	_enter("{%lu},,,", dvnode->netfs.inode.i_ino);
+
+	if (!afs_dir_init_iter(&iter, name))
+		return -ENOENT;
+	do {
+		if (--retry_limit < 0) {
+			pr_warn("afs_read_dir(): Too many retries\n");
+			ret = -ESTALE;
+			break;
+		}
+		ret = afs_read_dir(dvnode, NULL);
+		if (ret < 0) {
+			if (ret != -ESTALE)
+				break;
+			if (test_bit(AFS_VNODE_DELETED, &dvnode->flags)) {
+				ret = -ESTALE;
+				break;
+			}
+			continue;
+		}
+		*_dir_version = inode_peek_iversion_raw(&dvnode->netfs.inode);
+
+		ret = afs_dir_search_bucket(&iter, name, _fid);
+		up_read(&dvnode->validate_lock);
+		if (ret == -ESTALE)
+			afs_dir_reset_iter(&iter);
+	} while (ret == -ESTALE);
+
+	_leave(" = %d", ret);
+	return ret;
+}
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index a1e581946b93..014495d4b868 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -69,6 +69,12 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
 	if (IS_ERR(op))
 		return PTR_ERR(op);
 
+	op->more_files = kvcalloc(2, sizeof(struct afs_vnode_param), GFP_KERNEL);
+	if (!op->more_files) {
+		afs_put_operation(op);
+		return -ENOMEM;
+	}
+
 	afs_op_set_vnode(op, 0, dvnode);
 	afs_op_set_vnode(op, 1, dvnode);
 	op->file[0].dv_delta = 1;
@@ -77,6 +83,11 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
 	op->file[1].modification = true;
 	op->file[0].update_ctime = true;
 	op->file[1].update_ctime = true;
+	op->more_files[0].vnode		= AFS_FS_I(d_inode(old));
+	op->more_files[0].speculative	= true;
+	op->more_files[1].vnode		= AFS_FS_I(d_inode(new));
+	op->more_files[1].speculative	= true;
+	op->nr_files = 4;
 
 	op->dentry		= old;
 	op->dentry_2		= new;
@@ -113,16 +124,14 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode,
 
 	sdentry = NULL;
 	do {
-		int slen;
-
 		dput(sdentry);
 		sillycounter++;
 
 		/* Create a silly name.  Note that the ".__afs" prefix is
 		 * understood by the salvager and must not be changed.
 		 */
-		slen = scnprintf(silly, sizeof(silly), ".__afs%04X", sillycounter);
-		sdentry = lookup_one_len(silly, dentry->d_parent, slen);
+		scnprintf(silly, sizeof(silly), ".__afs%04X", sillycounter);
+		sdentry = lookup_noperm(&QSTR(silly), dentry->d_parent);
 
 		/* N.B. Better to return EBUSY here ... it could be dangerous
 		 * to delete the file while it's in use.
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index c4d2711e20ad..aa56e8951e03 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -10,16 +10,19 @@
 #include <linux/dns_resolver.h>
 #include "internal.h"
 
-static atomic_t afs_autocell_ino;
+#define AFS_MIN_DYNROOT_CELL_INO 4 /* Allow for ., .., @cell, .@cell */
+#define AFS_MAX_DYNROOT_CELL_INO ((unsigned int)INT_MAX)
+
+static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry, ino_t ino);
 
 /*
  * iget5() comparator for inode created by autocell operations
- *
- * These pseudo inodes don't match anything.
  */
 static int afs_iget5_pseudo_test(struct inode *inode, void *opaque)
 {
-	return 0;
+	struct afs_fid *fid = opaque;
+
+	return inode->i_ino == fid->vnode;
 }
 
 /*
@@ -39,28 +42,16 @@ static int afs_iget5_pseudo_set(struct inode *inode, void *opaque)
 }
 
 /*
- * Create an inode for a dynamic root directory or an autocell dynamic
- * automount dir.
+ * Create an inode for an autocell dynamic automount dir.
  */
-struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
+static struct inode *afs_iget_pseudo_dir(struct super_block *sb, ino_t ino)
 {
-	struct afs_super_info *as = AFS_FS_S(sb);
 	struct afs_vnode *vnode;
 	struct inode *inode;
-	struct afs_fid fid = {};
+	struct afs_fid fid = { .vnode = ino, .unique = 1, };
 
 	_enter("");
 
-	if (as->volume)
-		fid.vid = as->volume->vid;
-	if (root) {
-		fid.vnode = 1;
-		fid.unique = 1;
-	} else {
-		fid.vnode = atomic_inc_return(&afs_autocell_ino);
-		fid.unique = 0;
-	}
-
 	inode = iget5_locked(sb, fid.vnode,
 			     afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid);
 	if (!inode) {
@@ -73,163 +64,76 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
 
 	vnode = AFS_FS_I(inode);
 
-	/* there shouldn't be an existing inode */
-	BUG_ON(!(inode->i_state & I_NEW));
-
-	netfs_inode_init(&vnode->netfs, NULL, false);
-	inode->i_size		= 0;
-	inode->i_mode		= S_IFDIR | S_IRUGO | S_IXUGO;
-	if (root) {
-		inode->i_op	= &afs_dynroot_inode_operations;
-		inode->i_fop	= &simple_dir_operations;
-	} else {
-		inode->i_op	= &afs_autocell_inode_operations;
-	}
-	set_nlink(inode, 2);
-	inode->i_uid		= GLOBAL_ROOT_UID;
-	inode->i_gid		= GLOBAL_ROOT_GID;
-	simple_inode_init_ts(inode);
-	inode->i_blocks		= 0;
-	inode->i_generation	= 0;
-
-	set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
-	if (!root) {
+	if (inode_state_read_once(inode) & I_NEW) {
+		netfs_inode_init(&vnode->netfs, NULL, false);
+		simple_inode_init_ts(inode);
+		set_nlink(inode, 2);
+		inode->i_size		= 0;
+		inode->i_mode		= S_IFDIR | 0555;
+		inode->i_op		= &afs_autocell_inode_operations;
+		inode->i_uid		= GLOBAL_ROOT_UID;
+		inode->i_gid		= GLOBAL_ROOT_GID;
+		inode->i_blocks		= 0;
+		inode->i_generation	= 0;
+		inode->i_flags		|= S_AUTOMOUNT | S_NOATIME;
+
+		set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
 		set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
-		inode->i_flags |= S_AUTOMOUNT;
-	}
 
-	inode->i_flags |= S_NOATIME;
-	unlock_new_inode(inode);
+		unlock_new_inode(inode);
+	}
 	_leave(" = %p", inode);
 	return inode;
 }
 
 /*
- * Probe to see if a cell may exist.  This prevents positive dentries from
- * being created unnecessarily.
+ * Look up a cell by name in the dynamic root and, if found, attach a pseudo
+ * directory inode that acts as the automount point for that cell.
  */
-static int afs_probe_cell_name(struct dentry *dentry)
+static struct dentry *afs_dynroot_lookup_cell(struct inode *dir, struct dentry *dentry,
+					      unsigned int flags)
 {
-	struct afs_cell *cell;
+	struct afs_cell *cell = NULL;
 	struct afs_net *net = afs_d2net(dentry);
+	struct inode *inode = NULL;
 	const char *name = dentry->d_name.name;
 	size_t len = dentry->d_name.len;
-	char *result = NULL;
-	int ret;
+	bool dotted = false;
+	int ret = -ENOENT;
 
 	/* Names prefixed with a dot are R/W mounts. */
 	if (name[0] == '.') {
-		if (len == 1)
-			return -EINVAL;
 		name++;
 		len--;
+		dotted = true;
 	}
 
-	cell = afs_find_cell(net, name, len, afs_cell_trace_use_probe);
-	if (!IS_ERR(cell)) {
-		afs_unuse_cell(net, cell, afs_cell_trace_unuse_probe);
-		return 0;
-	}
-
-	ret = dns_query(net->net, "afsdb", name, len, "srv=1",
-			&result, NULL, false);
-	if (ret == -ENODATA || ret == -ENOKEY || ret == 0)
-		ret = -ENOENT;
-	if (ret > 0 && ret >= sizeof(struct dns_server_list_v1_header)) {
-		struct dns_server_list_v1_header *v1 = (void *)result;
-
-		if (v1->hdr.zero == 0 &&
-		    v1->hdr.content == DNS_PAYLOAD_IS_SERVER_LIST &&
-		    v1->hdr.version == 1 &&
-		    (v1->status != DNS_LOOKUP_GOOD &&
-		     v1->status != DNS_LOOKUP_GOOD_WITH_BAD))
-			return -ENOENT;
-
+	cell = afs_lookup_cell(net, name, len, NULL,
+			       AFS_LOOKUP_CELL_DYNROOT,
+			       afs_cell_trace_use_lookup_dynroot);
+	if (IS_ERR(cell)) {
+		ret = PTR_ERR(cell);
+		goto out_no_cell;
 	}
 
-	kfree(result);
-	return ret;
-}
-
-/*
- * Try to auto mount the mountpoint with pseudo directory, if the autocell
- * operation is setted.
- */
-struct inode *afs_try_auto_mntpt(struct dentry *dentry, struct inode *dir)
-{
-	struct afs_vnode *vnode = AFS_FS_I(dir);
-	struct inode *inode;
-	int ret = -ENOENT;
-
-	_enter("%p{%pd}, {%llx:%llu}",
-	       dentry, dentry, vnode->fid.vid, vnode->fid.vnode);
-
-	if (!test_bit(AFS_VNODE_AUTOCELL, &vnode->flags))
-		goto out;
-
-	ret = afs_probe_cell_name(dentry);
-	if (ret < 0)
-		goto out;
-
-	inode = afs_iget_pseudo_dir(dir->i_sb, false);
+	inode = afs_iget_pseudo_dir(dir->i_sb, cell->dynroot_ino * 2 + dotted);
 	if (IS_ERR(inode)) {
 		ret = PTR_ERR(inode);
 		goto out;
 	}
 
-	_leave("= %p", inode);
-	return inode;
+	dentry->d_fsdata = cell;
+	return d_splice_alias(inode, dentry);
 
 out:
-	_leave("= %d", ret);
+	afs_unuse_cell(cell, afs_cell_trace_unuse_lookup_dynroot);
+out_no_cell:
+	if (!inode)
+		return d_splice_alias(inode, dentry);
 	return ret == -ENOENT ? NULL : ERR_PTR(ret);
 }
 
 /*
- * Look up @cell in a dynroot directory.  This is a substitution for the
- * local cell name for the net namespace.
- */
-static struct dentry *afs_lookup_atcell(struct dentry *dentry)
-{
-	struct afs_cell *cell;
-	struct afs_net *net = afs_d2net(dentry);
-	struct dentry *ret;
-	char *name;
-	int len;
-
-	if (!net->ws_cell)
-		return ERR_PTR(-ENOENT);
-
-	ret = ERR_PTR(-ENOMEM);
-	name = kmalloc(AFS_MAXCELLNAME + 1, GFP_KERNEL);
-	if (!name)
-		goto out_p;
-
-	down_read(&net->cells_lock);
-	cell = net->ws_cell;
-	if (cell) {
-		len = cell->name_len;
-		memcpy(name, cell->name, len + 1);
-	}
-	up_read(&net->cells_lock);
-
-	ret = ERR_PTR(-ENOENT);
-	if (!cell)
-		goto out_n;
-
-	ret = lookup_one_len(name, dentry->d_parent, len);
-
-	/* We don't want to d_add() the @cell dentry here as we don't want to
-	 * the cached dentry to hide changes to the local cell name.
-	 */
-
-out_n:
-	kfree(name);
-out_p:
-	return ret;
-}
-
-/*
  * Look up an entry in a dynroot directory.
  */
 static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentry,
@@ -237,8 +141,6 @@ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentr
 {
 	_enter("%pd", dentry);
 
-	ASSERTCMP(d_inode(dentry), ==, NULL);
-
 	if (flags & LOOKUP_CREATE)
 		return ERR_PTR(-EOPNOTSUPP);
 
@@ -249,141 +151,256 @@ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentr
 
 	if (dentry->d_name.len == 5 &&
 	    memcmp(dentry->d_name.name, "@cell", 5) == 0)
-		return afs_lookup_atcell(dentry);
+		return afs_lookup_atcell(dir, dentry, 2);
+
+	if (dentry->d_name.len == 6 &&
+	    memcmp(dentry->d_name.name, ".@cell", 6) == 0)
+		return afs_lookup_atcell(dir, dentry, 3);
 
-	return d_splice_alias(afs_try_auto_mntpt(dentry, dir), dentry);
+	return afs_dynroot_lookup_cell(dir, dentry, flags);
 }
 
 const struct inode_operations afs_dynroot_inode_operations = {
 	.lookup		= afs_dynroot_lookup,
 };
 
+static void afs_dynroot_d_release(struct dentry *dentry)
+{
+	struct afs_cell *cell = dentry->d_fsdata;
+
+	afs_unuse_cell(cell, afs_cell_trace_unuse_dynroot_mntpt);
+}
+
+/*
+ * Keep @cell symlink dentries around, but only keep cell autodirs when they're
+ * being used.
+ */
+static int afs_dynroot_delete_dentry(const struct dentry *dentry)
+{
+	const struct qstr *name = &dentry->d_name;
+
+	if (name->len == 5 && memcmp(name->name, "@cell", 5) == 0)
+		return 0;
+	if (name->len == 6 && memcmp(name->name, ".@cell", 6) == 0)
+		return 0;
+	return 1;
+}
+
 const struct dentry_operations afs_dynroot_dentry_operations = {
-	.d_delete	= always_delete_dentry,
-	.d_release	= afs_d_release,
+	.d_delete	= afs_dynroot_delete_dentry,
+	.d_release	= afs_dynroot_d_release,
 	.d_automount	= afs_d_automount,
 };
 
+static void afs_atcell_delayed_put_cell(void *arg)
+{
+	struct afs_cell *cell = arg;
+
+	afs_put_cell(cell, afs_cell_trace_put_atcell);
+}
+
 /*
- * Create a manually added cell mount directory.
- * - The caller must hold net->proc_cells_lock
+ * Read @cell or .@cell symlinks.
  */
-int afs_dynroot_mkdir(struct afs_net *net, struct afs_cell *cell)
+static const char *afs_atcell_get_link(struct dentry *dentry, struct inode *inode,
+				       struct delayed_call *done)
 {
-	struct super_block *sb = net->dynroot_sb;
-	struct dentry *root, *subdir;
-	int ret;
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	struct afs_cell *cell;
+	struct afs_net *net = afs_i2net(inode);
+	const char *name;
+	bool dotted = vnode->fid.vnode == 3;
 
-	if (!sb || atomic_read(&sb->s_active) == 0)
-		return 0;
+	if (!rcu_access_pointer(net->ws_cell))
+		return ERR_PTR(-ENOENT);
 
-	/* Let the ->lookup op do the creation */
-	root = sb->s_root;
-	inode_lock(root->d_inode);
-	subdir = lookup_one_len(cell->name, root, cell->name_len);
-	if (IS_ERR(subdir)) {
-		ret = PTR_ERR(subdir);
-		goto unlock;
+	if (!dentry) {
+		/* We're in RCU-pathwalk. */
+		cell = rcu_dereference(net->ws_cell);
+		if (dotted)
+			name = cell->name - 1;
+		else
+			name = cell->name;
+		/* Shouldn't need to set a delayed call. */
+		return name;
 	}
 
-	/* Note that we're retaining an extra ref on the dentry */
-	subdir->d_fsdata = (void *)1UL;
-	ret = 0;
-unlock:
-	inode_unlock(root->d_inode);
-	return ret;
+	down_read(&net->cells_lock);
+
+	cell = rcu_dereference_protected(net->ws_cell, lockdep_is_held(&net->cells_lock));
+	if (dotted)
+		name = cell->name - 1;
+	else
+		name = cell->name;
+	afs_get_cell(cell, afs_cell_trace_get_atcell);
+	set_delayed_call(done, afs_atcell_delayed_put_cell, cell);
+
+	up_read(&net->cells_lock);
+	return name;
 }
 
+static const struct inode_operations afs_atcell_inode_operations = {
+	.get_link	= afs_atcell_get_link,
+};
+
 /*
- * Remove a manually added cell mount directory.
- * - The caller must hold net->proc_cells_lock
+ * Create an inode for the @cell or .@cell symlinks.
  */
-void afs_dynroot_rmdir(struct afs_net *net, struct afs_cell *cell)
+static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry, ino_t ino)
 {
-	struct super_block *sb = net->dynroot_sb;
-	struct dentry *root, *subdir;
+	struct afs_vnode *vnode;
+	struct inode *inode;
+	struct afs_fid fid = { .vnode = ino, .unique = 1, };
 
-	if (!sb || atomic_read(&sb->s_active) == 0)
-		return;
+	inode = iget5_locked(dir->i_sb, fid.vnode,
+			     afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
 
-	root = sb->s_root;
-	inode_lock(root->d_inode);
+	vnode = AFS_FS_I(inode);
 
-	/* Don't want to trigger a lookup call, which will re-add the cell */
-	subdir = try_lookup_one_len(cell->name, root, cell->name_len);
-	if (IS_ERR_OR_NULL(subdir)) {
-		_debug("lookup %ld", PTR_ERR(subdir));
-		goto no_dentry;
+	if (inode_state_read_once(inode) & I_NEW) {
+		netfs_inode_init(&vnode->netfs, NULL, false);
+		simple_inode_init_ts(inode);
+		set_nlink(inode, 1);
+		inode->i_size		= 0;
+		inode->i_mode		= S_IFLNK | 0555;
+		inode->i_op		= &afs_atcell_inode_operations;
+		inode->i_uid		= GLOBAL_ROOT_UID;
+		inode->i_gid		= GLOBAL_ROOT_GID;
+		inode->i_blocks		= 0;
+		inode->i_generation	= 0;
+		inode->i_flags		|= S_NOATIME;
+
+		unlock_new_inode(inode);
 	}
+	return d_splice_alias(inode, dentry);
+}
 
-	_debug("rmdir %pd %u", subdir, d_count(subdir));
+/*
+ * Transcribe the cell database into readdir content; caller holds cells_lock.
+ * Each cell produces two entries, one prefixed with a dot and one not.
+ */
+static int afs_dynroot_readdir_cells(struct afs_net *net, struct dir_context *ctx)
+{
+	const struct afs_cell *cell;
+	loff_t newpos;
+
+	_enter("%llu", ctx->pos);
+
+	for (;;) {
+		unsigned int ix = ctx->pos >> 1;
+
+		cell = idr_get_next(&net->cells_dyn_ino, &ix);
+		if (!cell)
+			return 0;
+		if (READ_ONCE(cell->state) == AFS_CELL_REMOVING ||
+		    READ_ONCE(cell->state) == AFS_CELL_DEAD) {
+			ctx->pos += 2;
+			ctx->pos &= ~1;
+			continue;
+		}
+
+		newpos = ix << 1;
+		if (newpos > ctx->pos)
+			ctx->pos = newpos;
 
-	if (subdir->d_fsdata) {
-		_debug("unpin %u", d_count(subdir));
-		subdir->d_fsdata = NULL;
-		dput(subdir);
+		_debug("pos %llu -> cell %u", ctx->pos, cell->dynroot_ino);
+
+		if ((ctx->pos & 1) == 0) {
+			if (!dir_emit(ctx, cell->name, cell->name_len,
+				      cell->dynroot_ino, DT_DIR))
+				return 0;
+			ctx->pos++;
+		}
+		if ((ctx->pos & 1) == 1) {
+			if (!dir_emit(ctx, cell->name - 1, cell->name_len + 1,
+				      cell->dynroot_ino + 1, DT_DIR))
+				return 0;
+			ctx->pos++;
+		}
 	}
-	dput(subdir);
-no_dentry:
-	inode_unlock(root->d_inode);
-	_leave("");
+	return 0;
 }
 
 /*
- * Populate a newly created dynamic root with cell names.
+ * Read the AFS dynamic root directory.  This produces a list of cellnames,
+ * dotted and undotted, along with @cell and .@cell links if configured.
  */
-int afs_dynroot_populate(struct super_block *sb)
+static int afs_dynroot_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct afs_cell *cell;
-	struct afs_net *net = afs_sb2net(sb);
-	int ret;
+	struct afs_net *net = afs_d2net(file->f_path.dentry);
+	int ret = 0;
 
-	mutex_lock(&net->proc_cells_lock);
+	if (!dir_emit_dots(file, ctx))
+		return 0;
 
-	net->dynroot_sb = sb;
-	hlist_for_each_entry(cell, &net->proc_cells, proc_link) {
-		ret = afs_dynroot_mkdir(net, cell);
-		if (ret < 0)
-			goto error;
+	if (ctx->pos == 2) {
+		if (rcu_access_pointer(net->ws_cell) &&
+		    !dir_emit(ctx, "@cell", 5, 2, DT_LNK))
+			return 0;
+		ctx->pos = 3;
+	}
+	if (ctx->pos == 3) {
+		if (rcu_access_pointer(net->ws_cell) &&
+		    !dir_emit(ctx, ".@cell", 6, 3, DT_LNK))
+			return 0;
+		ctx->pos = 4;
 	}
 
-	ret = 0;
-out:
-	mutex_unlock(&net->proc_cells_lock);
+	if ((unsigned long long)ctx->pos <= AFS_MAX_DYNROOT_CELL_INO) {
+		down_read(&net->cells_lock);
+		ret = afs_dynroot_readdir_cells(net, ctx);
+		up_read(&net->cells_lock);
+	}
 	return ret;
-
-error:
-	net->dynroot_sb = NULL;
-	goto out;
 }
 
+static const struct file_operations afs_dynroot_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.iterate_shared	= afs_dynroot_readdir,
+	.fsync		= noop_fsync,
+};
+
 /*
- * When a dynamic root that's in the process of being destroyed, depopulate it
- * of pinned directories.
+ * Create an inode for a dynamic root directory.
  */
-void afs_dynroot_depopulate(struct super_block *sb)
+struct inode *afs_dynroot_iget_root(struct super_block *sb)
 {
-	struct afs_net *net = afs_sb2net(sb);
-	struct dentry *root = sb->s_root, *subdir;
-
-	/* Prevent more subdirs from being created */
-	mutex_lock(&net->proc_cells_lock);
-	if (net->dynroot_sb == sb)
-		net->dynroot_sb = NULL;
-	mutex_unlock(&net->proc_cells_lock);
-
-	if (root) {
-		struct hlist_node *n;
-		inode_lock(root->d_inode);
-
-		/* Remove all the pins for dirs created for manually added cells */
-		hlist_for_each_entry_safe(subdir, n, &root->d_children, d_sib) {
-			if (subdir->d_fsdata) {
-				subdir->d_fsdata = NULL;
-				dput(subdir);
-			}
-		}
+	struct afs_super_info *as = AFS_FS_S(sb);
+	struct afs_vnode *vnode;
+	struct inode *inode;
+	struct afs_fid fid = { .vid = 0, .vnode = 1, .unique = 1,};
+
+	if (as->volume)
+		fid.vid = as->volume->vid;
 
-		inode_unlock(root->d_inode);
+	inode = iget5_locked(sb, fid.vnode,
+			     afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	vnode = AFS_FS_I(inode);
+
+	/* Only initialise the inode if iget5_locked() returned a new one. */
+	if (inode_state_read_once(inode) & I_NEW) {
+		netfs_inode_init(&vnode->netfs, NULL, false);
+		simple_inode_init_ts(inode);
+		set_nlink(inode, 2);
+		inode->i_size		= 0;
+		inode->i_mode		= S_IFDIR | 0555;
+		inode->i_op		= &afs_dynroot_inode_operations;
+		inode->i_fop		= &afs_dynroot_file_operations;
+		inode->i_uid		= GLOBAL_ROOT_UID;
+		inode->i_gid		= GLOBAL_ROOT_GID;
+		inode->i_blocks		= 0;
+		inode->i_generation	= 0;
+		inode->i_flags		|= S_NOATIME;
+
+		set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
+		unlock_new_inode(inode);
 	}
+	_leave(" = %p", inode);
+	return inode;
 }
diff --git a/fs/afs/file.c b/fs/afs/file.c
index c3f0c45ae9a9..f66a92294284 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -16,10 +16,10 @@
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/netfs.h>
+#include <trace/events/netfs.h>
 #include "internal.h"
 
-static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
-static int afs_symlink_read_folio(struct file *file, struct folio *folio);
+static int afs_file_mmap_prepare(struct vm_area_desc *desc);
 
 static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter);
 static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos,
@@ -35,7 +35,7 @@ const struct file_operations afs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read_iter	= afs_file_read_iter,
 	.write_iter	= netfs_file_write_iter,
-	.mmap		= afs_file_mmap,
+	.mmap_prepare	= afs_file_mmap_prepare,
 	.splice_read	= afs_file_splice_read,
 	.splice_write	= iter_file_splice_write,
 	.fsync		= afs_fsync,
@@ -60,13 +60,6 @@ const struct address_space_operations afs_file_aops = {
 	.writepages	= afs_writepages,
 };
 
-const struct address_space_operations afs_symlink_aops = {
-	.read_folio	= afs_symlink_read_folio,
-	.release_folio	= netfs_release_folio,
-	.invalidate_folio = netfs_invalidate_folio,
-	.migrate_folio	= filemap_migrate_folio,
-};
-
 static const struct vm_operations_struct afs_vm_ops = {
 	.open		= afs_vm_open,
 	.close		= afs_vm_close,
@@ -207,47 +200,12 @@ int afs_release(struct inode *inode, struct file *file)
 	return ret;
 }
 
-/*
- * Allocate a new read record.
- */
-struct afs_read *afs_alloc_read(gfp_t gfp)
-{
-	struct afs_read *req;
-
-	req = kzalloc(sizeof(struct afs_read), gfp);
-	if (req)
-		refcount_set(&req->usage, 1);
-
-	return req;
-}
-
-/*
- * Dispose of a ref to a read record.
- */
-void afs_put_read(struct afs_read *req)
-{
-	if (refcount_dec_and_test(&req->usage)) {
-		if (req->cleanup)
-			req->cleanup(req);
-		key_put(req->key);
-		kfree(req);
-	}
-}
-
 static void afs_fetch_data_notify(struct afs_operation *op)
 {
-	struct afs_read *req = op->fetch.req;
-	struct netfs_io_subrequest *subreq = req->subreq;
-	int error = afs_op_error(op);
-
-	req->error = error;
-	if (subreq) {
-		__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
-		netfs_subreq_terminated(subreq, error ?: req->actual_len, false);
-		req->subreq = NULL;
-	} else if (req->done) {
-		req->done(req);
-	}
+	struct netfs_io_subrequest *subreq = op->fetch.subreq;
+
+	subreq->error = afs_op_error(op);
+	netfs_read_subreq_terminated(subreq);
 }
 
 static void afs_fetch_data_success(struct afs_operation *op)
@@ -257,103 +215,198 @@ static void afs_fetch_data_success(struct afs_operation *op)
 	_enter("op=%08x", op->debug_id);
 	afs_vnode_commit_status(op, &op->file[0]);
 	afs_stat_v(vnode, n_fetches);
-	atomic_long_add(op->fetch.req->actual_len, &op->net->n_fetch_bytes);
+	atomic_long_add(op->fetch.subreq->transferred, &op->net->n_fetch_bytes);
 	afs_fetch_data_notify(op);
 }
 
-static void afs_fetch_data_put(struct afs_operation *op)
+static void afs_fetch_data_aborted(struct afs_operation *op)
 {
-	op->fetch.req->error = afs_op_error(op);
-	afs_put_read(op->fetch.req);
+	afs_check_for_remote_deletion(op);
+	afs_fetch_data_notify(op);
 }
 
-static const struct afs_operation_ops afs_fetch_data_operation = {
+const struct afs_operation_ops afs_fetch_data_operation = {
 	.issue_afs_rpc	= afs_fs_fetch_data,
 	.issue_yfs_rpc	= yfs_fs_fetch_data,
 	.success	= afs_fetch_data_success,
-	.aborted	= afs_check_for_remote_deletion,
+	.aborted	= afs_fetch_data_aborted,
 	.failed		= afs_fetch_data_notify,
-	.put		= afs_fetch_data_put,
 };
 
+static void afs_issue_read_call(struct afs_operation *op)
+{
+	op->call_responded = false;
+	op->call_error = 0;
+	op->call_abort_code = 0;
+	if (test_bit(AFS_SERVER_FL_IS_YFS, &op->server->flags))
+		yfs_fs_fetch_data(op);
+	else
+		afs_fs_fetch_data(op);
+}
+
+static void afs_end_read(struct afs_operation *op)
+{
+	if (op->call_responded && op->server)
+		set_bit(AFS_SERVER_FL_RESPONDING, &op->server->flags);
+
+	if (!afs_op_error(op))
+		afs_fetch_data_success(op);
+	else if (op->cumul_error.aborted)
+		afs_fetch_data_aborted(op);
+	else
+		afs_fetch_data_notify(op);
+
+	afs_end_vnode_operation(op);
+	afs_put_operation(op);
+}
+
+/*
+ * Perform I/O processing on an asynchronous call.  The work item carries a ref
+ * to the call struct that we either need to release or to pass on.
+ */
+static void afs_read_receive(struct afs_call *call)
+{
+	struct afs_operation *op = call->op;
+	enum afs_call_state state;
+
+	_enter("");
+
+	state = READ_ONCE(call->state);
+	if (state == AFS_CALL_COMPLETE)
+		return;
+	trace_afs_read_recv(op, call);
+
+	while (state < AFS_CALL_COMPLETE && READ_ONCE(call->need_attention)) {
+		WRITE_ONCE(call->need_attention, false);
+		afs_deliver_to_call(call);
+		state = READ_ONCE(call->state);
+	}
+
+	if (state < AFS_CALL_COMPLETE) {
+		netfs_read_subreq_progress(op->fetch.subreq);
+		if (rxrpc_kernel_check_life(call->net->socket, call->rxcall))
+			return;
+		/* rxrpc terminated the call. */
+		afs_set_call_complete(call, call->error, call->abort_code);
+	}
+
+	op->call_abort_code	= call->abort_code;
+	op->call_error		= call->error;
+	op->call_responded	= call->responded;
+	op->call		= NULL;
+	call->op		= NULL;
+	afs_put_call(call);
+
+	/* If the call failed, then we need to crank the server rotation
+	 * handle and try the next.
+	 */
+	if (afs_select_fileserver(op)) {
+		afs_issue_read_call(op);
+		return;
+	}
+
+	afs_end_read(op);
+}
+
+void afs_fetch_data_async_rx(struct work_struct *work)
+{
+	struct afs_call *call = container_of(work, struct afs_call, async_work);
+
+	afs_read_receive(call);
+	afs_put_call(call);
+}
+
+void afs_fetch_data_immediate_cancel(struct afs_call *call)
+{
+	if (call->async) {
+		afs_get_call(call, afs_call_trace_wake);
+		if (!queue_work(afs_async_calls, &call->async_work))
+			afs_deferred_put_call(call);
+		flush_work(&call->async_work);
+	}
+}
+
 /*
  * Fetch file data from the volume.
  */
-int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req)
+static void afs_issue_read(struct netfs_io_subrequest *subreq)
 {
 	struct afs_operation *op;
+	struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode);
+	struct key *key = subreq->rreq->netfs_priv;
 
 	_enter("%s{%llx:%llu.%u},%x,,,",
 	       vnode->volume->name,
 	       vnode->fid.vid,
 	       vnode->fid.vnode,
 	       vnode->fid.unique,
-	       key_serial(req->key));
+	       key_serial(key));
 
-	op = afs_alloc_operation(req->key, vnode->volume);
+	op = afs_alloc_operation(key, vnode->volume);
 	if (IS_ERR(op)) {
-		if (req->subreq)
-			netfs_subreq_terminated(req->subreq, PTR_ERR(op), false);
-		return PTR_ERR(op);
+		subreq->error = PTR_ERR(op);
+		netfs_read_subreq_terminated(subreq);
+		return;
 	}
 
 	afs_op_set_vnode(op, 0, vnode);
 
-	op->fetch.req	= afs_get_read(req);
+	op->fetch.subreq = subreq;
 	op->ops		= &afs_fetch_data_operation;
-	return afs_do_sync_operation(op);
-}
-
-static void afs_issue_read(struct netfs_io_subrequest *subreq)
-{
-	struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode);
-	struct afs_read *fsreq;
 
-	fsreq = afs_alloc_read(GFP_NOFS);
-	if (!fsreq)
-		return netfs_subreq_terminated(subreq, -ENOMEM, false);
+	trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
 
-	fsreq->subreq	= subreq;
-	fsreq->pos	= subreq->start + subreq->transferred;
-	fsreq->len	= subreq->len   - subreq->transferred;
-	fsreq->key	= key_get(subreq->rreq->netfs_priv);
-	fsreq->vnode	= vnode;
-	fsreq->iter	= &subreq->io_iter;
+	if (subreq->rreq->origin == NETFS_READAHEAD ||
+	    subreq->rreq->iocb) {
+		op->flags |= AFS_OPERATION_ASYNC;
 
-	afs_fetch_data(fsreq->vnode, fsreq);
-	afs_put_read(fsreq);
-}
+		if (!afs_begin_vnode_operation(op)) {
+			subreq->error = afs_put_operation(op);
+			netfs_read_subreq_terminated(subreq);
+			return;
+		}
 
-static int afs_symlink_read_folio(struct file *file, struct folio *folio)
-{
-	struct afs_vnode *vnode = AFS_FS_I(folio->mapping->host);
-	struct afs_read *fsreq;
-	int ret;
-
-	fsreq = afs_alloc_read(GFP_NOFS);
-	if (!fsreq)
-		return -ENOMEM;
+		if (!afs_select_fileserver(op)) {
+			afs_end_read(op);
+			return;
+		}
 
-	fsreq->pos	= folio_pos(folio);
-	fsreq->len	= folio_size(folio);
-	fsreq->vnode	= vnode;
-	fsreq->iter	= &fsreq->def_iter;
-	iov_iter_xarray(&fsreq->def_iter, ITER_DEST, &folio->mapping->i_pages,
-			fsreq->pos, fsreq->len);
-
-	ret = afs_fetch_data(fsreq->vnode, fsreq);
-	if (ret == 0)
-		folio_mark_uptodate(folio);
-	folio_unlock(folio);
-	return ret;
+		afs_issue_read_call(op);
+	} else {
+		afs_do_sync_operation(op);
+	}
 }
 
 static int afs_init_request(struct netfs_io_request *rreq, struct file *file)
 {
+	struct afs_vnode *vnode = AFS_FS_I(rreq->inode);
+
 	if (file)
 		rreq->netfs_priv = key_get(afs_file_key(file));
 	rreq->rsize = 256 * 1024;
 	rreq->wsize = 256 * 1024 * 1024;
+
+	switch (rreq->origin) {
+	case NETFS_READ_SINGLE:
+		if (!file) {
+			struct key *key = afs_request_key(vnode->volume->cell);
+
+			if (IS_ERR(key))
+				return PTR_ERR(key);
+			rreq->netfs_priv = key;
+		}
+		break;
+	case NETFS_WRITEBACK:
+	case NETFS_WRITETHROUGH:
+	case NETFS_UNBUFFERED_WRITE:
+	case NETFS_DIO_WRITE:
+		if (S_ISREG(rreq->inode->i_mode))
+			rreq->io_streams[0].avail = true;
+		break;
+	case NETFS_WRITEBACK_SINGLE:
+	default:
+		break;
+	}
 	return 0;
 }
 
@@ -403,6 +456,7 @@ const struct netfs_request_ops afs_req_ops = {
 	.begin_writeback	= afs_begin_writeback,
 	.prepare_write		= afs_prepare_write,
 	.issue_write		= afs_issue_write,
+	.retry_request		= afs_retry_request,
 };
 
 static void afs_add_open_mmap(struct afs_vnode *vnode)
@@ -438,16 +492,16 @@ static void afs_drop_open_mmap(struct afs_vnode *vnode)
 /*
  * Handle setting up a memory mapping on an AFS file.
  */
-static int afs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int afs_file_mmap_prepare(struct vm_area_desc *desc)
 {
-	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
+	struct afs_vnode *vnode = AFS_FS_I(file_inode(desc->file));
 	int ret;
 
 	afs_add_open_mmap(vnode);
 
-	ret = generic_file_mmap(file, vma);
+	ret = generic_file_mmap_prepare(desc);
 	if (ret == 0)
-		vma->vm_ops = &afs_vm_ops;
+		desc->vm_ops = &afs_vm_ops;
 	else
 		afs_drop_open_mmap(vnode);
 	return ret;
diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
index 3546b087e791..8418813ee043 100644
--- a/fs/afs/fs_operation.c
+++ b/fs/afs/fs_operation.c
@@ -49,6 +49,105 @@ struct afs_operation *afs_alloc_operation(struct key *key, struct afs_volume *vo
 	return op;
 }
 
+struct afs_io_locker {
+	struct list_head	link;
+	struct task_struct	*task;
+	unsigned long		have_lock;
+};
+
+/*
+ * Unlock the I/O lock on a vnode.
+ */
+static void afs_unlock_for_io(struct afs_vnode *vnode)
+{
+	struct afs_io_locker *locker;
+
+	spin_lock(&vnode->lock);
+	locker = list_first_entry_or_null(&vnode->io_lock_waiters,
+					  struct afs_io_locker, link);
+	if (locker) {
+		list_del(&locker->link);
+		smp_store_release(&locker->have_lock, 1); /* The unlock barrier. */
+		smp_mb__after_atomic(); /* Store have_lock before task state */
+		wake_up_process(locker->task);
+	} else {
+		clear_bit(AFS_VNODE_IO_LOCK, &vnode->flags);
+	}
+	spin_unlock(&vnode->lock);
+}
+
+/*
+ * Lock the I/O lock on a vnode uninterruptibly.  We can't use an ordinary
+ * mutex as lockdep will complain if we unlock it in the wrong thread.
+ */
+static void afs_lock_for_io(struct afs_vnode *vnode)
+{
+	struct afs_io_locker myself = { .task = current, };
+
+	spin_lock(&vnode->lock);
+
+	if (!test_and_set_bit(AFS_VNODE_IO_LOCK, &vnode->flags)) {
+		spin_unlock(&vnode->lock);
+		return;
+	}
+
+	list_add_tail(&myself.link, &vnode->io_lock_waiters);
+	spin_unlock(&vnode->lock);
+
+	for (;;) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (smp_load_acquire(&myself.have_lock)) /* The lock barrier */
+			break;
+		schedule();
+	}
+	__set_current_state(TASK_RUNNING);
+}
+
+/*
+ * Lock the I/O lock on a vnode interruptibly.  We can't use an ordinary mutex
+ * as lockdep will complain if we unlock it in the wrong thread.
+ */
+static int afs_lock_for_io_interruptible(struct afs_vnode *vnode)
+{
+	struct afs_io_locker myself = { .task = current, };
+	int ret = 0;
+
+	spin_lock(&vnode->lock);
+
+	if (!test_and_set_bit(AFS_VNODE_IO_LOCK, &vnode->flags)) {
+		spin_unlock(&vnode->lock);
+		return 0;
+	}
+
+	list_add_tail(&myself.link, &vnode->io_lock_waiters);
+	spin_unlock(&vnode->lock);
+
+	for (;;) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (smp_load_acquire(&myself.have_lock) || /* The lock barrier */
+		    signal_pending(current))
+			break;
+		schedule();
+	}
+	__set_current_state(TASK_RUNNING);
+
+	/* If we got a signal, try to transfer the lock onto the next
+	 * waiter.
+	 */
+	if (unlikely(signal_pending(current))) {
+		spin_lock(&vnode->lock);
+		if (myself.have_lock) {
+			spin_unlock(&vnode->lock);
+			afs_unlock_for_io(vnode);
+		} else {
+			list_del(&myself.link);
+			spin_unlock(&vnode->lock);
+		}
+		ret = -ERESTARTSYS;
+	}
+	return ret;
+}
+
 /*
  * Lock the vnode(s) being operated upon.
  */
@@ -60,7 +159,7 @@ static bool afs_get_io_locks(struct afs_operation *op)
 	_enter("");
 
 	if (op->flags & AFS_OPERATION_UNINTR) {
-		mutex_lock(&vnode->io_lock);
+		afs_lock_for_io(vnode);
 		op->flags |= AFS_OPERATION_LOCK_0;
 		_leave(" = t [1]");
 		return true;
@@ -72,7 +171,7 @@ static bool afs_get_io_locks(struct afs_operation *op)
 	if (vnode2 > vnode)
 		swap(vnode, vnode2);
 
-	if (mutex_lock_interruptible(&vnode->io_lock) < 0) {
+	if (afs_lock_for_io_interruptible(vnode) < 0) {
 		afs_op_set_error(op, -ERESTARTSYS);
 		op->flags |= AFS_OPERATION_STOP;
 		_leave(" = f [I 0]");
@@ -81,10 +180,10 @@ static bool afs_get_io_locks(struct afs_operation *op)
 	op->flags |= AFS_OPERATION_LOCK_0;
 
 	if (vnode2) {
-		if (mutex_lock_interruptible_nested(&vnode2->io_lock, 1) < 0) {
+		if (afs_lock_for_io_interruptible(vnode2) < 0) {
 			afs_op_set_error(op, -ERESTARTSYS);
 			op->flags |= AFS_OPERATION_STOP;
-			mutex_unlock(&vnode->io_lock);
+			afs_unlock_for_io(vnode);
 			op->flags &= ~AFS_OPERATION_LOCK_0;
 			_leave(" = f [I 1]");
 			return false;
@@ -104,9 +203,9 @@ static void afs_drop_io_locks(struct afs_operation *op)
 	_enter("");
 
 	if (op->flags & AFS_OPERATION_LOCK_1)
-		mutex_unlock(&vnode2->io_lock);
+		afs_unlock_for_io(vnode2);
 	if (op->flags & AFS_OPERATION_LOCK_0)
-		mutex_unlock(&vnode->io_lock);
+		afs_unlock_for_io(vnode);
 }
 
 static void afs_prepare_vnode(struct afs_operation *op, struct afs_vnode_param *vp,
@@ -157,7 +256,7 @@ bool afs_begin_vnode_operation(struct afs_operation *op)
 /*
  * Tidy up a filesystem cursor and unlock the vnode.
  */
-static void afs_end_vnode_operation(struct afs_operation *op)
+void afs_end_vnode_operation(struct afs_operation *op)
 {
 	_enter("");
 
@@ -201,7 +300,7 @@ void afs_wait_for_operation(struct afs_operation *op)
 		}
 	}
 
-	if (op->call_responded)
+	if (op->call_responded && op->server)
 		set_bit(AFS_SERVER_FL_RESPONDING, &op->server->flags);
 
 	if (!afs_op_error(op)) {
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index 580de4adaaf6..e0030ac74ea0 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -235,20 +235,20 @@ out:
  * Probe all of a fileserver's addresses to find out the best route and to
  * query its capabilities.
  */
-void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
-			     struct afs_addr_list *new_alist, struct key *key)
+int afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
+			    struct afs_addr_list *new_alist, struct key *key)
 {
 	struct afs_endpoint_state *estate, *old;
-	struct afs_addr_list *alist;
+	struct afs_addr_list *old_alist = NULL, *alist;
 	unsigned long unprobed;
 
 	_enter("%pU", &server->uuid);
 
 	estate = kzalloc(sizeof(*estate), GFP_KERNEL);
 	if (!estate)
-		return;
+		return -ENOMEM;
 
-	refcount_set(&estate->ref, 1);
+	refcount_set(&estate->ref, 2);
 	estate->server_id = server->debug_id;
 	estate->rtt = UINT_MAX;
 
@@ -256,21 +256,31 @@ void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
 
 	old = rcu_dereference_protected(server->endpoint_state,
 					lockdep_is_held(&server->fs_lock));
-	estate->responsive_set = old->responsive_set;
-	estate->addresses = afs_get_addrlist(new_alist ?: old->addresses,
-					     afs_alist_trace_get_estate);
+	if (old) {
+		estate->responsive_set = old->responsive_set;
+		if (!new_alist)
+			new_alist = old->addresses;
+	}
+
+	if (old_alist != new_alist)
+		afs_set_peer_appdata(server, old_alist, new_alist);
+
+	estate->addresses = afs_get_addrlist(new_alist, afs_alist_trace_get_estate);
 	alist = estate->addresses;
 	estate->probe_seq = ++server->probe_counter;
 	atomic_set(&estate->nr_probing, alist->nr_addrs);
 
+	if (new_alist)
+		server->addr_version = new_alist->version;
 	rcu_assign_pointer(server->endpoint_state, estate);
-	set_bit(AFS_ESTATE_SUPERSEDED, &old->flags);
 	write_unlock(&server->fs_lock);
+	if (old)
+		set_bit(AFS_ESTATE_SUPERSEDED, &old->flags);
 
 	trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref),
 			 afs_estate_trace_alloc_probe);
 
-	afs_get_address_preferences(net, alist);
+	afs_get_address_preferences(net, new_alist);
 
 	server->probed_at = jiffies;
 	unprobed = (1UL << alist->nr_addrs) - 1;
@@ -293,6 +303,8 @@ void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
 	}
 
 	afs_put_endpoint_state(old, afs_estate_trace_put_probe);
+	afs_put_endpoint_state(estate, afs_estate_trace_put_probe);
+	return 0;
 }
 
 /*
@@ -506,10 +518,10 @@ int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_sta
 	finish_wait(&server->probe_wq, &wait);
 
 dont_wait:
-	if (estate->responsive_set & ~exclude)
-		return 1;
 	if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags))
 		return 0;
+	if (estate->responsive_set & ~exclude)
+		return 1;
 	if (is_intr && signal_pending(current))
 		return -ERESTARTSYS;
 	if (timo == 0)
@@ -522,6 +534,6 @@ dont_wait:
  */
 void afs_fs_probe_cleanup(struct afs_net *net)
 {
-	if (del_timer_sync(&net->fs_probe_timer))
+	if (timer_delete_sync(&net->fs_probe_timer))
 		afs_dec_servers_outstanding(net);
 }
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 79cd30775b7a..bc9556991d7c 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -301,18 +301,19 @@ void afs_fs_fetch_status(struct afs_operation *op)
 static int afs_deliver_fs_fetch_data(struct afs_call *call)
 {
 	struct afs_operation *op = call->op;
+	struct netfs_io_subrequest *subreq = op->fetch.subreq;
 	struct afs_vnode_param *vp = &op->file[0];
-	struct afs_read *req = op->fetch.req;
 	const __be32 *bp;
+	size_t count_before;
 	int ret;
 
 	_enter("{%u,%zu,%zu/%llu}",
 	       call->unmarshall, call->iov_len, iov_iter_count(call->iter),
-	       req->actual_len);
+	       call->remaining);
 
 	switch (call->unmarshall) {
 	case 0:
-		req->actual_len = 0;
+		call->remaining = 0;
 		call->unmarshall++;
 		if (call->operation_ID == FSFETCHDATA64) {
 			afs_extract_to_tmp64(call);
@@ -322,8 +323,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 		}
 		fallthrough;
 
-		/* Extract the returned data length into
-		 * ->actual_len.  This may indicate more or less data than was
+		/* Extract the returned data length into ->remaining.
+		 * This may indicate more or less data than was
 		 * requested will be returned.
 		 */
 	case 1:
@@ -332,38 +333,40 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 		if (ret < 0)
 			return ret;
 
-		req->actual_len = be64_to_cpu(call->tmp64);
-		_debug("DATA length: %llu", req->actual_len);
+		call->remaining = be64_to_cpu(call->tmp64);
+		_debug("DATA length: %llu", call->remaining);
 
-		if (req->actual_len == 0)
+		if (call->remaining == 0)
 			goto no_more_data;
 
-		call->iter = req->iter;
-		call->iov_len = min(req->actual_len, req->len);
+		call->iter = &subreq->io_iter;
+		call->iov_len = umin(call->remaining, subreq->len - subreq->transferred);
 		call->unmarshall++;
 		fallthrough;
 
 		/* extract the returned data */
 	case 2:
-		_debug("extract data %zu/%llu",
-		       iov_iter_count(call->iter), req->actual_len);
+		count_before = call->iov_len;
+		_debug("extract data %zu/%llu", count_before, call->remaining);
 
 		ret = afs_extract_data(call, true);
+		subreq->transferred += count_before - call->iov_len;
+		call->remaining -= count_before - call->iov_len;
 		if (ret < 0)
 			return ret;
 
 		call->iter = &call->def_iter;
-		if (req->actual_len <= req->len)
+		if (!call->remaining)
 			goto no_more_data;
 
 		/* Discard any excess data the server gave us */
-		afs_extract_discard(call, req->actual_len - req->len);
+		afs_extract_discard(call, call->remaining);
 		call->unmarshall = 3;
 		fallthrough;
 
 	case 3:
 		_debug("extract discard %zu/%llu",
-		       iov_iter_count(call->iter), req->actual_len - req->len);
+		       iov_iter_count(call->iter), call->remaining);
 
 		ret = afs_extract_data(call, true);
 		if (ret < 0)
@@ -385,8 +388,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 		xdr_decode_AFSCallBack(&bp, call, &vp->scb);
 		xdr_decode_AFSVolSync(&bp, &op->volsync);
 
-		req->data_version = vp->scb.status.data_version;
-		req->file_size = vp->scb.status.size;
+		if (subreq->start + subreq->transferred >= vp->scb.status.size)
+			__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
 
 		call->unmarshall++;
 		fallthrough;
@@ -405,14 +408,18 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 static const struct afs_call_type afs_RXFSFetchData = {
 	.name		= "FS.FetchData",
 	.op		= afs_FS_FetchData,
+	.async_rx	= afs_fetch_data_async_rx,
 	.deliver	= afs_deliver_fs_fetch_data,
+	.immediate_cancel = afs_fetch_data_immediate_cancel,
 	.destructor	= afs_flat_call_destructor,
 };
 
 static const struct afs_call_type afs_RXFSFetchData64 = {
 	.name		= "FS.FetchData64",
 	.op		= afs_FS_FetchData64,
+	.async_rx	= afs_fetch_data_async_rx,
 	.deliver	= afs_deliver_fs_fetch_data,
+	.immediate_cancel = afs_fetch_data_immediate_cancel,
 	.destructor	= afs_flat_call_destructor,
 };
 
@@ -421,8 +428,8 @@ static const struct afs_call_type afs_RXFSFetchData64 = {
  */
 static void afs_fs_fetch_data64(struct afs_operation *op)
 {
+	struct netfs_io_subrequest *subreq = op->fetch.subreq;
 	struct afs_vnode_param *vp = &op->file[0];
-	struct afs_read *req = op->fetch.req;
 	struct afs_call *call;
 	__be32 *bp;
 
@@ -432,16 +439,19 @@ static void afs_fs_fetch_data64(struct afs_operation *op)
 	if (!call)
 		return afs_op_nomem(op);
 
+	if (op->flags & AFS_OPERATION_ASYNC)
+		call->async = true;
+
 	/* marshall the parameters */
 	bp = call->request;
 	bp[0] = htonl(FSFETCHDATA64);
 	bp[1] = htonl(vp->fid.vid);
 	bp[2] = htonl(vp->fid.vnode);
 	bp[3] = htonl(vp->fid.unique);
-	bp[4] = htonl(upper_32_bits(req->pos));
-	bp[5] = htonl(lower_32_bits(req->pos));
+	bp[4] = htonl(upper_32_bits(subreq->start + subreq->transferred));
+	bp[5] = htonl(lower_32_bits(subreq->start + subreq->transferred));
 	bp[6] = 0;
-	bp[7] = htonl(lower_32_bits(req->len));
+	bp[7] = htonl(lower_32_bits(subreq->len   - subreq->transferred));
 
 	call->fid = vp->fid;
 	trace_afs_make_fs_call(call, &vp->fid);
@@ -453,9 +463,9 @@ static void afs_fs_fetch_data64(struct afs_operation *op)
  */
 void afs_fs_fetch_data(struct afs_operation *op)
 {
+	struct netfs_io_subrequest *subreq = op->fetch.subreq;
 	struct afs_vnode_param *vp = &op->file[0];
 	struct afs_call *call;
-	struct afs_read *req = op->fetch.req;
 	__be32 *bp;
 
 	if (test_bit(AFS_SERVER_FL_HAS_FS64, &op->server->flags))
@@ -467,16 +477,14 @@ void afs_fs_fetch_data(struct afs_operation *op)
 	if (!call)
 		return afs_op_nomem(op);
 
-	req->call_debug_id = call->debug_id;
-
 	/* marshall the parameters */
 	bp = call->request;
 	bp[0] = htonl(FSFETCHDATA);
 	bp[1] = htonl(vp->fid.vid);
 	bp[2] = htonl(vp->fid.vnode);
 	bp[3] = htonl(vp->fid.unique);
-	bp[4] = htonl(lower_32_bits(req->pos));
-	bp[5] = htonl(lower_32_bits(req->len));
+	bp[4] = htonl(lower_32_bits(subreq->start + subreq->transferred));
+	bp[5] = htonl(lower_32_bits(subreq->len   - subreq->transferred));
 
 	call->fid = vp->fid;
 	trace_afs_make_fs_call(call, &vp->fid);
@@ -1645,7 +1653,7 @@ int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server,
 	bp = call->request;
 	*bp++ = htonl(FSGIVEUPALLCALLBACKS);
 
-	call->server = afs_use_server(server, afs_server_trace_give_up_cb);
+	call->server = afs_use_server(server, false, afs_server_trace_use_give_up_cb);
 	afs_make_call(call, GFP_NOFS);
 	afs_wait_for_call_to_complete(call);
 	ret = call->error;
@@ -1728,6 +1736,7 @@ static const struct afs_call_type afs_RXFSGetCapabilities = {
 	.op		= afs_FS_GetCapabilities,
 	.deliver	= afs_deliver_fs_get_capabilities,
 	.done		= afs_fileserver_probe_result,
+	.immediate_cancel = afs_fileserver_probe_result,
 	.destructor	= afs_fs_get_capabilities_destructor,
 };
 
@@ -1751,7 +1760,7 @@ bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server,
 		return false;
 
 	call->key	= key;
-	call->server	= afs_use_server(server, afs_server_trace_get_caps);
+	call->server	= afs_use_server(server, false, afs_server_trace_use_get_caps);
 	call->peer	= rxrpc_kernel_get_peer(estate->addresses->addrs[addr_index].peer);
 	call->probe	= afs_get_endpoint_state(estate, afs_estate_trace_get_getcaps);
 	call->probe_index = addr_index;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 94fc049aff58..dde1857fcabb 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -25,8 +25,94 @@
 #include "internal.h"
 #include "afs_fs.h"
 
+void afs_init_new_symlink(struct afs_vnode *vnode, struct afs_operation *op)
+{
+	size_t size = strlen(op->create.symlink) + 1;
+	size_t dsize = 0;
+	char *p;
+
+	if (netfs_alloc_folioq_buffer(NULL, &vnode->directory, &dsize, size,
+				      mapping_gfp_mask(vnode->netfs.inode.i_mapping)) < 0)
+		return;
+
+	vnode->directory_size = dsize;
+	p = kmap_local_folio(folioq_folio(vnode->directory, 0), 0);
+	memcpy(p, op->create.symlink, size);
+	kunmap_local(p);
+	set_bit(AFS_VNODE_DIR_READ, &vnode->flags);
+	netfs_single_mark_inode_dirty(&vnode->netfs.inode);
+}
+
+static void afs_put_link(void *arg)
+{
+	struct folio *folio = virt_to_folio(arg);
+
+	kunmap_local(arg);
+	folio_put(folio);
+}
+
+const char *afs_get_link(struct dentry *dentry, struct inode *inode,
+			 struct delayed_call *callback)
+{
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	struct folio *folio;
+	char *content;
+	ssize_t ret;
+
+	if (!dentry) {
+		/* RCU pathwalk. */
+		if (!test_bit(AFS_VNODE_DIR_READ, &vnode->flags) || !afs_check_validity(vnode))
+			return ERR_PTR(-ECHILD);
+		goto good;
+	}
+
+	if (test_bit(AFS_VNODE_DIR_READ, &vnode->flags))
+		goto fetch;
+
+	ret = afs_validate(vnode, NULL);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	if (!test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) &&
+	    test_bit(AFS_VNODE_DIR_READ, &vnode->flags))
+		goto good;
+
+fetch:
+	ret = afs_read_single(vnode, NULL);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	set_bit(AFS_VNODE_DIR_READ, &vnode->flags);
+
+good:
+	folio = folioq_folio(vnode->directory, 0);
+	folio_get(folio);
+	content = kmap_local_folio(folio, 0);
+	set_delayed_call(callback, afs_put_link, content);
+	return content;
+}
+
+int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
+{
+	DEFINE_DELAYED_CALL(done);
+	const char *content;
+	int len;
+
+	content = afs_get_link(dentry, d_inode(dentry), &done);
+	if (IS_ERR(content)) {
+		do_delayed_call(&done);
+		return PTR_ERR(content);
+	}
+
+	len = umin(strlen(content), buflen);
+	if (copy_to_user(buffer, content, len))
+		len = -EFAULT;
+	do_delayed_call(&done);
+	return len;
+}
+
 static const struct inode_operations afs_symlink_inode_operations = {
-	.get_link	= page_get_link,
+	.get_link	= afs_get_link,
+	.readlink	= afs_readlink,
 };
 
 static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *parent_vnode)
@@ -110,7 +196,9 @@ static int afs_inode_init_from_status(struct afs_operation *op,
 		inode->i_op	= &afs_dir_inode_operations;
 		inode->i_fop	= &afs_dir_file_operations;
 		inode->i_mapping->a_ops	= &afs_dir_aops;
-		mapping_set_large_folios(inode->i_mapping);
+		__set_bit(NETFS_ICTX_SINGLE_NO_UPLOAD, &vnode->netfs.flags);
+		/* Assume locally cached directory data will be valid. */
+		__set_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
 		break;
 	case AFS_FTYPE_SYMLINK:
 		/* Symlinks with a mode of 0644 are actually mountpoints. */
@@ -122,13 +210,13 @@ static int afs_inode_init_from_status(struct afs_operation *op,
 			inode->i_mode	= S_IFDIR | 0555;
 			inode->i_op	= &afs_mntpt_inode_operations;
 			inode->i_fop	= &afs_mntpt_file_operations;
-			inode->i_mapping->a_ops	= &afs_symlink_aops;
 		} else {
 			inode->i_mode	= S_IFLNK | status->mode;
 			inode->i_op	= &afs_symlink_inode_operations;
-			inode->i_mapping->a_ops	= &afs_symlink_aops;
 		}
+		inode->i_mapping->a_ops	= &afs_dir_aops;
 		inode_nohighmem(inode);
+		mapping_set_release_always(inode->i_mapping);
 		break;
 	default:
 		dump_vnode(vnode, op->file[0].vnode != vnode ? op->file[0].vnode : NULL);
@@ -140,15 +228,17 @@ static int afs_inode_init_from_status(struct afs_operation *op,
 	afs_set_netfs_context(vnode);
 
 	vnode->invalid_before	= status->data_version;
+	trace_afs_set_dv(vnode, status->data_version);
 	inode_set_iversion_raw(&vnode->netfs.inode, status->data_version);
 
 	if (!vp->scb.have_cb) {
 		/* it's a symlink we just created (the fileserver
 		 * didn't give us a callback) */
-		atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE);
+		afs_clear_cb_promise(vnode, afs_cb_promise_set_new_symlink);
 	} else {
 		vnode->cb_server = op->server;
-		atomic64_set(&vnode->cb_expires_at, vp->scb.callback.expires_at);
+		afs_set_cb_promise(vnode, vp->scb.callback.expires_at,
+				   afs_cb_promise_set_new_inode);
 	}
 
 	write_sequnlock(&vnode->cb_lock);
@@ -207,12 +297,17 @@ static void afs_apply_status(struct afs_operation *op,
 	if (vp->update_ctime)
 		inode_set_ctime_to_ts(inode, op->ctime);
 
-	if (vnode->status.data_version != status->data_version)
+	if (vnode->status.data_version != status->data_version) {
+		trace_afs_set_dv(vnode, status->data_version);
 		data_changed = true;
+	}
 
 	vnode->status = *status;
 
 	if (vp->dv_before + vp->dv_delta != status->data_version) {
+		trace_afs_dv_mismatch(vnode, vp->dv_before, vp->dv_delta,
+				      status->data_version);
+
 		if (vnode->cb_ro_snapshot == atomic_read(&vnode->volume->cb_ro_snapshot) &&
 		    atomic64_read(&vnode->cb_expires_at) != AFS_NO_CB_PROMISE)
 			pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s (op=%x)\n",
@@ -223,12 +318,10 @@ static void afs_apply_status(struct afs_operation *op,
 				op->debug_id);
 
 		vnode->invalid_before = status->data_version;
-		if (vnode->status.type == AFS_FTYPE_DIR) {
-			if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
-				afs_stat_v(vnode, n_inval);
-		} else {
+		if (vnode->status.type == AFS_FTYPE_DIR)
+			afs_invalidate_dir(vnode, afs_dir_invalid_dv_mismatch);
+		else
 			set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
-		}
 		change_size = true;
 		data_changed = true;
 		unexpected_jump = true;
@@ -258,6 +351,8 @@ static void afs_apply_status(struct afs_operation *op,
 			inode_set_ctime_to_ts(inode, t);
 			inode_set_atime_to_ts(inode, t);
 		}
+		if (op->ops == &afs_fetch_data_operation)
+			op->fetch.subreq->rreq->i_size = status->size;
 	}
 }
 
@@ -273,7 +368,7 @@ static void afs_apply_callback(struct afs_operation *op,
 	if (!afs_cb_is_broken(vp->cb_break_before, vnode)) {
 		if (op->volume->type == AFSVL_RWVOL)
 			vnode->cb_server = op->server;
-		atomic64_set(&vnode->cb_expires_at, cb->expires_at);
+		afs_set_cb_promise(vnode, cb->expires_at, afs_cb_promise_set_apply_cb);
 	}
 }
 
@@ -332,7 +427,7 @@ static void afs_fetch_status_success(struct afs_operation *op)
 	struct afs_vnode *vnode = vp->vnode;
 	int ret;
 
-	if (vnode->netfs.inode.i_state & I_NEW) {
+	if (inode_state_read_once(&vnode->netfs.inode) & I_NEW) {
 		ret = afs_inode_init_from_status(op, vp, vnode);
 		afs_op_set_error(op, ret);
 		if (ret == 0)
@@ -435,7 +530,9 @@ static void afs_get_inode_cache(struct afs_vnode *vnode)
 	} __packed key;
 	struct afs_vnode_cache_aux aux;
 
-	if (vnode->status.type != AFS_FTYPE_FILE) {
+	if (vnode->status.type != AFS_FTYPE_FILE &&
+	    vnode->status.type != AFS_FTYPE_DIR &&
+	    vnode->status.type != AFS_FTYPE_SYMLINK) {
 		vnode->netfs.cache = NULL;
 		return;
 	}
@@ -482,7 +579,7 @@ struct inode *afs_iget(struct afs_operation *op, struct afs_vnode_param *vp)
 	       inode, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
 
 	/* deal with an existing inode */
-	if (!(inode->i_state & I_NEW)) {
+	if (!(inode_state_read_once(inode) & I_NEW)) {
 		_leave(" = %p", inode);
 		return inode;
 	}
@@ -512,7 +609,7 @@ static int afs_iget5_set_root(struct inode *inode, void *opaque)
 	struct afs_vnode *vnode = AFS_FS_I(inode);
 
 	vnode->volume		= as->volume;
-	vnode->fid.vid		= as->volume->vid,
+	vnode->fid.vid		= as->volume->vid;
 	vnode->fid.vnode	= 1;
 	vnode->fid.unique	= 1;
 	inode->i_ino		= 1;
@@ -542,10 +639,10 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key)
 
 	_debug("GOT ROOT INODE %p { vl=%llx }", inode, as->volume->vid);
 
-	BUG_ON(!(inode->i_state & I_NEW));
+	BUG_ON(!(inode_state_read_once(inode) & I_NEW));
 
 	vnode = AFS_FS_I(inode);
-	vnode->cb_v_check = atomic_read(&as->volume->cb_v_break),
+	vnode->cb_v_check = atomic_read(&as->volume->cb_v_break);
 	afs_set_netfs_context(vnode);
 
 	op = afs_alloc_operation(key, as->volume);
@@ -626,9 +723,9 @@ int afs_drop_inode(struct inode *inode)
 	_enter("");
 
 	if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags))
-		return generic_delete_inode(inode);
+		return inode_just_drop(inode);
 	else
-		return generic_drop_inode(inode);
+		return inode_generic_drop(inode);
 }
 
 /*
@@ -637,6 +734,7 @@ int afs_drop_inode(struct inode *inode)
 void afs_evict_inode(struct inode *inode)
 {
 	struct afs_vnode_cache_aux aux;
+	struct afs_super_info *sbi = AFS_FS_S(inode->i_sb);
 	struct afs_vnode *vnode = AFS_FS_I(inode);
 
 	_enter("{%llx:%llu.%d}",
@@ -648,7 +746,22 @@ void afs_evict_inode(struct inode *inode)
 
 	ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
 
+	if ((S_ISDIR(inode->i_mode) ||
+	     S_ISLNK(inode->i_mode)) &&
+	    (inode_state_read_once(inode) & I_DIRTY) &&
+	    !sbi->dyn_root) {
+		struct writeback_control wbc = {
+			.sync_mode = WB_SYNC_ALL,
+			.for_sync = true,
+			.range_end = LLONG_MAX,
+		};
+
+		afs_single_writepages(inode->i_mapping, &wbc);
+	}
+
+	netfs_wait_for_outstanding_io(inode);
 	truncate_inode_pages_final(&inode->i_data);
+	netfs_free_folioq_buffer(vnode->directory);
 
 	afs_set_cache_aux(vnode, &aux);
 	netfs_clear_inode_writeback(inode, &aux);
@@ -694,13 +807,18 @@ static void afs_setattr_edit_file(struct afs_operation *op)
 {
 	struct afs_vnode_param *vp = &op->file[0];
 	struct afs_vnode *vnode = vp->vnode;
+	struct inode *inode = &vnode->netfs.inode;
 
 	if (op->setattr.attr->ia_valid & ATTR_SIZE) {
 		loff_t size = op->setattr.attr->ia_size;
-		loff_t i_size = op->setattr.old_i_size;
+		loff_t old = op->setattr.old_i_size;
+
+		/* Note: inode->i_size was updated by afs_apply_status() inside
+		 * the I/O and callback locks.
+		 */
 
-		if (size != i_size) {
-			truncate_setsize(&vnode->netfs.inode, size);
+		if (size != old) {
+			truncate_pagecache(inode, size);
 			netfs_resize_file(&vnode->netfs, size, true);
 			fscache_resize_cookie(afs_vnode_cache(vnode), size);
 		}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 6e1d3c4daf72..009064b8d661 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -20,6 +20,7 @@
 #include <linux/uuid.h>
 #include <linux/mm_types.h>
 #include <linux/dns_resolver.h>
+#include <crypto/krb5.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 #include <net/sock.h>
@@ -130,6 +131,7 @@ struct afs_call {
 	wait_queue_head_t	waitq;		/* processes awaiting completion */
 	struct work_struct	async_work;	/* async I/O processor */
 	struct work_struct	work;		/* actual work processor */
+	struct work_struct	free_work;	/* Deferred free processor */
 	struct rxrpc_call	*rxcall;	/* RxRPC call handle */
 	struct rxrpc_peer	*peer;		/* Remote endpoint */
 	struct key		*key;		/* security for this call */
@@ -162,6 +164,7 @@ struct afs_call {
 	spinlock_t		state_lock;
 	int			error;		/* error code */
 	u32			abort_code;	/* Remote abort ID or 0 */
+	unsigned long long	remaining;	/* How much is left to receive */
 	unsigned int		max_lifespan;	/* Maximum lifespan in secs to set if not 0 */
 	unsigned		request_size;	/* size of request data */
 	unsigned		reply_max;	/* maximum size of reply */
@@ -174,8 +177,10 @@ struct afs_call {
 	bool			intr;		/* T if interruptible */
 	bool			unmarshalling_error; /* T if an unmarshalling error occurred */
 	bool			responded;	/* Got a response from the call (may be abort) */
+	u8			security_ix;	/* Security class */
 	u16			service_id;	/* Actual service ID (after upgrade) */
 	unsigned int		debug_id;	/* Trace ID */
+	u32			enctype;	/* Security encoding type */
 	u32			operation_ID;	/* operation ID for an incoming call */
 	u32			count;		/* count for use in unmarshalling */
 	union {					/* place to extract temporary data */
@@ -200,11 +205,17 @@ struct afs_call_type {
 	/* clean up a call */
 	void (*destructor)(struct afs_call *call);
 
+	/* Async receive processing function */
+	void (*async_rx)(struct work_struct *work);
+
 	/* Work function */
 	void (*work)(struct work_struct *work);
 
 	/* Call done function (gets called immediately on success or failure) */
 	void (*done)(struct afs_call *call);
+
+	/* Handle a call being immediately cancelled. */
+	void (*immediate_cancel)(struct afs_call *call);
 };
 
 /*
@@ -232,28 +243,6 @@ static inline struct key *afs_file_key(struct file *file)
 }
 
 /*
- * Record of an outstanding read operation on a vnode.
- */
-struct afs_read {
-	loff_t			pos;		/* Where to start reading */
-	loff_t			len;		/* How much we're asking for */
-	loff_t			actual_len;	/* How much we're actually getting */
-	loff_t			file_size;	/* File size returned by server */
-	struct key		*key;		/* The key to use to reissue the read */
-	struct afs_vnode	*vnode;		/* The file being read into. */
-	struct netfs_io_subrequest *subreq;	/* Fscache helper read request this belongs to */
-	afs_dataversion_t	data_version;	/* Version number returned by server */
-	refcount_t		usage;
-	unsigned int		call_debug_id;
-	unsigned int		nr_pages;
-	int			error;
-	void (*done)(struct afs_read *);
-	void (*cleanup)(struct afs_read *);
-	struct iov_iter		*iter;		/* Iterator representing the buffer */
-	struct iov_iter		def_iter;	/* Default iterator */
-};
-
-/*
  * AFS superblock private data
  * - there's one superblock per volume
  */
@@ -295,15 +284,15 @@ struct afs_net {
 	struct socket		*socket;
 	struct afs_call		*spare_incoming_call;
 	struct work_struct	charge_preallocation_work;
+	struct work_struct	rx_oob_work;
 	struct mutex		socket_mutex;
 	atomic_t		nr_outstanding_calls;
 	atomic_t		nr_superblocks;
 
 	/* Cell database */
 	struct rb_root		cells;
-	struct afs_cell		*ws_cell;
-	struct work_struct	cells_manager;
-	struct timer_list	cells_timer;
+	struct idr		cells_dyn_ino;	/* cell->dynroot_ino mapping */
+	struct afs_cell __rcu	*ws_cell;
 	atomic_t		cells_outstanding;
 	struct rw_semaphore	cells_lock;
 	struct mutex		cells_alias_lock;
@@ -315,18 +304,12 @@ struct afs_net {
 	 * cell, but in practice, people create aliases and subsets and there's
 	 * no easy way to distinguish them.
 	 */
-	seqlock_t		fs_lock;	/* For fs_servers, fs_probe_*, fs_proc */
-	struct rb_root		fs_servers;	/* afs_server (by server UUID or address) */
+	seqlock_t		fs_lock;	/* For fs_probe_*, fs_proc */
 	struct list_head	fs_probe_fast;	/* List of afs_server to probe at 30s intervals */
 	struct list_head	fs_probe_slow;	/* List of afs_server to probe at 5m intervals */
 	struct hlist_head	fs_proc;	/* procfs servers list */
 
-	struct hlist_head	fs_addresses;	/* afs_server (by lowest IPv6 addr) */
-	seqlock_t		fs_addr_lock;	/* For fs_addresses[46] */
-
-	struct work_struct	fs_manager;
-	struct timer_list	fs_timer;
-
+	struct key		*fs_cm_token_key; /* Key for creating CM tokens */
 	struct work_struct	fs_prober;
 	struct timer_list	fs_probe_timer;
 	atomic_t		servers_outstanding;
@@ -359,13 +342,11 @@ struct afs_net {
 extern const char afs_init_sysname[];
 
 enum afs_cell_state {
-	AFS_CELL_UNSET,
-	AFS_CELL_ACTIVATING,
+	AFS_CELL_SETTING_UP,
+	AFS_CELL_UNLOOKED,
 	AFS_CELL_ACTIVE,
-	AFS_CELL_DEACTIVATING,
-	AFS_CELL_INACTIVE,
-	AFS_CELL_FAILED,
-	AFS_CELL_REMOVED,
+	AFS_CELL_REMOVING,
+	AFS_CELL_DEAD,
 };
 
 /*
@@ -396,7 +377,9 @@ struct afs_cell {
 	struct afs_cell		*alias_of;	/* The cell this is an alias of */
 	struct afs_volume	*root_volume;	/* The root.cell volume if there is one */
 	struct key		*anonymous_key;	/* anonymous user key for this cell */
+	struct work_struct	destroyer;	/* Destroyer for cell */
 	struct work_struct	manager;	/* Manager for init/deinit/dns */
+	struct timer_list	management_timer; /* General management timer */
 	struct hlist_node	proc_link;	/* /proc cell list link */
 	time64_t		dns_expiry;	/* Time AFSDB/SRV record expires */
 	time64_t		last_inactive;	/* Time of last drop of usage count */
@@ -412,6 +395,7 @@ struct afs_cell {
 	enum dns_lookup_status	dns_status:8;	/* Latest status of data from lookup */
 	unsigned int		dns_lookup_count; /* Counter of DNS lookups */
 	unsigned int		debug_id;
+	unsigned int		dynroot_ino;	/* Inode numbers for dynroot (a pair) */
 
 	/* The volumes belonging to this cell */
 	struct rw_semaphore	vs_lock;	/* Lock for server->volumes */
@@ -421,7 +405,7 @@ struct afs_cell {
 
 	/* Active fileserver interaction state. */
 	struct rb_root		fs_servers;	/* afs_server (by server UUID) */
-	seqlock_t		fs_lock;	/* For fs_servers  */
+	struct rw_semaphore	fs_lock;	/* For fs_servers  */
 
 	/* VL server list. */
 	rwlock_t		vl_servers_lock; /* Lock on vl_servers */
@@ -429,6 +413,7 @@ struct afs_cell {
 
 	u8			name_len;	/* Length of name */
 	char			*name;		/* Cell name, case-flattened and NUL-padded */
+	char			*key_desc;	/* Authentication key description */
 };
 
 /*
@@ -556,31 +541,35 @@ struct afs_server {
 	};
 
 	struct afs_cell		*cell;		/* Cell to which belongs (pins ref) */
-	struct rb_node		uuid_rb;	/* Link in net->fs_servers */
-	struct afs_server __rcu	*uuid_next;	/* Next server with same UUID */
-	struct afs_server	*uuid_prev;	/* Previous server with same UUID */
-	struct list_head	probe_link;	/* Link in net->fs_probe_list */
-	struct hlist_node	addr_link;	/* Link in net->fs_addresses6 */
+	struct rb_node		uuid_rb;	/* Link in cell->fs_servers */
+	struct list_head	probe_link;	/* Link in net->fs_probe_* */
 	struct hlist_node	proc_link;	/* Link in net->fs_proc */
 	struct list_head	volumes;	/* RCU list of afs_server_entry objects */
-	struct afs_server	*gc_next;	/* Next server in manager's list */
+	struct work_struct	destroyer;	/* Work item to try and destroy a server */
+	struct timer_list	timer;		/* Management timer */
+	struct mutex		cm_token_lock;	/* Lock governing creation of appdata */
+	struct krb5_buffer	cm_rxgk_appdata; /* Appdata to be included in RESPONSE packet */
 	time64_t		unuse_time;	/* Time at which last unused */
 	unsigned long		flags;
 #define AFS_SERVER_FL_RESPONDING 0		/* The server is responding */
 #define AFS_SERVER_FL_UPDATING	1
 #define AFS_SERVER_FL_NEEDS_UPDATE 2		/* Fileserver address list is out of date */
-#define AFS_SERVER_FL_NOT_READY	4		/* The record is not ready for use */
-#define AFS_SERVER_FL_NOT_FOUND	5		/* VL server says no such server */
-#define AFS_SERVER_FL_VL_FAIL	6		/* Failed to access VL server */
+#define AFS_SERVER_FL_UNCREATED	3		/* The record needs creating */
+#define AFS_SERVER_FL_CREATING	4		/* The record is being created */
+#define AFS_SERVER_FL_EXPIRED	5		/* The record has expired */
+#define AFS_SERVER_FL_NOT_FOUND	6		/* VL server says no such server */
+#define AFS_SERVER_FL_VL_FAIL	7		/* Failed to access VL server */
 #define AFS_SERVER_FL_MAY_HAVE_CB 8		/* May have callbacks on this fileserver */
 #define AFS_SERVER_FL_IS_YFS	16		/* Server is YFS not AFS */
 #define AFS_SERVER_FL_NO_IBULK	17		/* Fileserver doesn't support FS.InlineBulkStatus */
 #define AFS_SERVER_FL_NO_RM2	18		/* Fileserver doesn't support YFS.RemoveFile2 */
 #define AFS_SERVER_FL_HAS_FS64	19		/* Fileserver supports FS.{Fetch,Store}Data64 */
+#define AFS_SERVER_FL_NO_RENAME2 20		/* YFS Fileserver doesn't support enhanced rename */
 	refcount_t		ref;		/* Object refcount */
 	atomic_t		active;		/* Active user count */
 	u32			addr_version;	/* Address list version */
 	u16			service_id;	/* Service ID we're using. */
+	short			create_error;	/* Creation error */
 	unsigned int		rtt;		/* Server's current RTT in uS */
 	unsigned int		debug_id;	/* Debugging ID for traces */
 
@@ -635,6 +624,7 @@ struct afs_volume {
 	afs_volid_t		vid;		/* The volume ID of this volume */
 	afs_volid_t		vids[AFS_MAXTYPES]; /* All associated volume IDs */
 	refcount_t		ref;
+	unsigned int		debug_id;	/* Debugging ID for traces */
 	time64_t		update_at;	/* Time at which to next update */
 	struct afs_cell		*cell;		/* Cell to which belongs (pins ref) */
 	struct rb_node		cell_node;	/* Link in cell->volumes */
@@ -701,24 +691,26 @@ struct afs_vnode {
 	struct afs_file_status	status;		/* AFS status info for this file */
 	afs_dataversion_t	invalid_before;	/* Child dentries are invalid before this */
 	struct afs_permits __rcu *permit_cache;	/* cache of permits so far obtained */
-	struct mutex		io_lock;	/* Lock for serialising I/O on this mutex */
+	struct list_head	io_lock_waiters; /* Threads waiting for the I/O lock */
 	struct rw_semaphore	validate_lock;	/* lock for validating this vnode */
 	struct rw_semaphore	rmdir_lock;	/* Lock for rmdir vs sillyrename */
 	struct key		*silly_key;	/* Silly rename key */
 	spinlock_t		wb_lock;	/* lock for wb_keys */
 	spinlock_t		lock;		/* waitqueue/flags lock */
 	unsigned long		flags;
+#define AFS_VNODE_IO_LOCK	0		/* Set if the I/O serialisation lock is held */
 #define AFS_VNODE_UNSET		1		/* set if vnode attributes not yet set */
 #define AFS_VNODE_DIR_VALID	2		/* Set if dir contents are valid */
 #define AFS_VNODE_ZAP_DATA	3		/* set if vnode's data should be invalidated */
 #define AFS_VNODE_DELETED	4		/* set if vnode deleted on server */
 #define AFS_VNODE_MOUNTPOINT	5		/* set if vnode is a mountpoint symlink */
-#define AFS_VNODE_AUTOCELL	6		/* set if Vnode is an auto mount point */
 #define AFS_VNODE_PSEUDODIR	7 		/* set if Vnode is a pseudo directory */
 #define AFS_VNODE_NEW_CONTENT	8		/* Set if file has new content (create/trunc-0) */
 #define AFS_VNODE_SILLY_DELETED	9		/* Set if file has been silly-deleted */
 #define AFS_VNODE_MODIFYING	10		/* Set if we're performing a modification op */
+#define AFS_VNODE_DIR_READ	11		/* Set if we've read a dir's contents */
 
+	struct folio_queue	*directory;	/* Directory contents */
 	struct list_head	wb_keys;	/* List of keys available for writeback */
 	struct list_head	pending_locks;	/* locks waiting to be granted */
 	struct list_head	granted_locks;	/* locks granted on this file */
@@ -727,6 +719,7 @@ struct afs_vnode {
 	ktime_t			locked_at;	/* Time at which lock obtained */
 	enum afs_lock_state	lock_state : 8;
 	afs_lock_type_t		lock_type : 8;
+	unsigned int		directory_size;	/* Amount of space in ->directory */
 
 	/* outstanding callback notification on this file */
 	struct work_struct	cb_work;	/* Work for mmap'd files */
@@ -901,12 +894,13 @@ struct afs_operation {
 			bool	need_rehash;
 		} unlink;
 		struct {
-			struct dentry *rehash;
-			struct dentry *tmp;
-			bool	new_negative;
+			struct dentry	*rehash;
+			struct dentry	*tmp;
+			unsigned int	rename_flags;
+			bool		new_negative;
 		} rename;
 		struct {
-			struct afs_read *req;
+			struct netfs_io_subrequest *subreq;
 		} fetch;
 		struct {
 			afs_lock_type_t type;
@@ -958,6 +952,7 @@ struct afs_operation {
 #define AFS_OPERATION_TRIED_ALL		0x0400	/* Set if we've tried all the fileservers */
 #define AFS_OPERATION_RETRY_SERVER	0x0800	/* Set if we should retry the current server */
 #define AFS_OPERATION_DIR_CONFLICT	0x1000	/* Set if we detected a 3rd-party dir change */
+#define AFS_OPERATION_ASYNC		0x2000	/* Set if should run asynchronously */
 };
 
 /*
@@ -982,6 +977,21 @@ static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int fl
 			   i_size_read(&vnode->netfs.inode), flags);
 }
 
+/*
+ * Directory iteration state, as passed to the afs_dir_*() search helpers.
+ */
+struct afs_dir_iter {
+	struct afs_vnode	*dvnode;	/* NOTE(review): presumably the dir being walked */
+	union afs_xdr_dir_block *block;
+	struct folio_queue	*fq;		/* NOTE(review): presumably in dvnode->directory */
+	unsigned int		fpos;
+	int			fq_slot;
+	unsigned int		loop_check;
+	u8			nr_slots;
+	u8			bucket;
+	unsigned int		prev_entry;
+};
+
 #include <trace/events/afs.h>
 
 /*****************************************************************************/
@@ -1002,6 +1012,9 @@ extern int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *addr,
 			      __be32 xdr, u16 port);
 extern int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *addr,
 			      __be32 *xdr, u16 port);
+void afs_set_peer_appdata(struct afs_server *server,
+			  struct afs_addr_list *old_alist,
+			  struct afs_addr_list *new_alist);
 
 /*
  * addr_prefs.c
@@ -1038,16 +1051,26 @@ static inline bool afs_cb_is_broken(unsigned int cb_break,
 extern int afs_cell_init(struct afs_net *, const char *);
 extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned,
 				      enum afs_cell_trace);
-extern struct afs_cell *afs_lookup_cell(struct afs_net *, const char *, unsigned,
-					const char *, bool);
+enum afs_lookup_cell_for {
+	AFS_LOOKUP_CELL_DYNROOT,
+	AFS_LOOKUP_CELL_MOUNTPOINT,	/* Passed by afs_mntpt_set_params() */
+	AFS_LOOKUP_CELL_DIRECT_MOUNT,
+	AFS_LOOKUP_CELL_PRELOAD,	/* Passed by afs_proc_cells_write() for "add" */
+	AFS_LOOKUP_CELL_ROOTCELL,
+	AFS_LOOKUP_CELL_ALIAS_CHECK,
+};
+struct afs_cell *afs_lookup_cell(struct afs_net *net,
+				 const char *name, unsigned int namesz,
+				 const char *vllist,
+				 enum afs_lookup_cell_for reason,
+				 enum afs_cell_trace trace);
 extern struct afs_cell *afs_use_cell(struct afs_cell *, enum afs_cell_trace);
-extern void afs_unuse_cell(struct afs_net *, struct afs_cell *, enum afs_cell_trace);
+void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason);
 extern struct afs_cell *afs_get_cell(struct afs_cell *, enum afs_cell_trace);
 extern void afs_see_cell(struct afs_cell *, enum afs_cell_trace);
 extern void afs_put_cell(struct afs_cell *, enum afs_cell_trace);
 extern void afs_queue_cell(struct afs_cell *, enum afs_cell_trace);
-extern void afs_manage_cells(struct work_struct *);
-extern void afs_cells_timer(struct timer_list *);
+void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs);
 extern void __net_exit afs_cell_purge(struct afs_net *);
 
 /*
@@ -1056,6 +1079,19 @@ extern void __net_exit afs_cell_purge(struct afs_net *);
 extern bool afs_cm_incoming_call(struct afs_call *);
 
 /*
+ * cm_security.c
+ */
+void afs_process_oob_queue(struct work_struct *work);
+#ifdef CONFIG_RXGK
+int afs_create_token_key(struct afs_net *net, struct socket *socket);
+#else
+static inline int afs_create_token_key(struct afs_net *net, struct socket *socket)
+{
+	return 0;	/* Stub used when CONFIG_RXGK is not set: reports success */
+}
+#endif
+
+/*
  * dir.c
  */
 extern const struct file_operations afs_dir_file_operations;
@@ -1063,15 +1099,34 @@ extern const struct inode_operations afs_dir_inode_operations;
 extern const struct address_space_operations afs_dir_aops;
 extern const struct dentry_operations afs_fs_dentry_operations;
 
+ssize_t afs_read_single(struct afs_vnode *dvnode, struct file *file);
+ssize_t afs_read_dir(struct afs_vnode *dvnode, struct file *file)
+	__acquires(&dvnode->validate_lock);
 extern void afs_d_release(struct dentry *);
 extern void afs_check_for_remote_deletion(struct afs_operation *);
+int afs_single_writepages(struct address_space *mapping,
+			  struct writeback_control *wbc);
 
 /*
  * dir_edit.c
  */
-extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *,
+extern void afs_edit_dir_add(struct afs_vnode *, const struct qstr *, struct afs_fid *,
 			     enum afs_edit_dir_reason);
-extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason);
+extern void afs_edit_dir_remove(struct afs_vnode *, const struct qstr *, enum afs_edit_dir_reason);
+void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name,
+			 struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why);
+void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_vnode);
+
+/*
+ * dir_search.c
+ */
+unsigned int afs_dir_hash_name(const struct qstr *name);
+bool afs_dir_init_iter(struct afs_dir_iter *iter, const struct qstr *name);
+union afs_xdr_dir_block *afs_dir_find_block(struct afs_dir_iter *iter, size_t block);
+int afs_dir_search_bucket(struct afs_dir_iter *iter, const struct qstr *name,
+			  struct afs_fid *_fid);
+int afs_dir_search(struct afs_vnode *dvnode, const struct qstr *name,
+		   struct afs_fid *_fid, afs_dataversion_t *_dir_version);
 
 /*
  * dir_silly.c
@@ -1086,34 +1141,23 @@ extern int afs_silly_iput(struct dentry *, struct inode *);
 extern const struct inode_operations afs_dynroot_inode_operations;
 extern const struct dentry_operations afs_dynroot_dentry_operations;
 
-extern struct inode *afs_try_auto_mntpt(struct dentry *, struct inode *);
-extern int afs_dynroot_mkdir(struct afs_net *, struct afs_cell *);
-extern void afs_dynroot_rmdir(struct afs_net *, struct afs_cell *);
-extern int afs_dynroot_populate(struct super_block *);
-extern void afs_dynroot_depopulate(struct super_block *);
+struct inode *afs_dynroot_iget_root(struct super_block *sb);
 
 /*
  * file.c
  */
 extern const struct address_space_operations afs_file_aops;
-extern const struct address_space_operations afs_symlink_aops;
 extern const struct inode_operations afs_file_inode_operations;
 extern const struct file_operations afs_file_operations;
+extern const struct afs_operation_ops afs_fetch_data_operation;
 extern const struct netfs_request_ops afs_req_ops;
 
 extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *);
 extern void afs_put_wb_key(struct afs_wb_key *);
 extern int afs_open(struct inode *, struct file *);
 extern int afs_release(struct inode *, struct file *);
-extern int afs_fetch_data(struct afs_vnode *, struct afs_read *);
-extern struct afs_read *afs_alloc_read(gfp_t);
-extern void afs_put_read(struct afs_read *);
-
-static inline struct afs_read *afs_get_read(struct afs_read *req)
-{
-	refcount_inc(&req->usage);
-	return req;
-}
+void afs_fetch_data_async_rx(struct work_struct *work);
+void afs_fetch_data_immediate_cancel(struct afs_call *call);
 
 /*
  * flock.c
@@ -1165,6 +1209,7 @@ extern void afs_fs_store_acl(struct afs_operation *);
 extern struct afs_operation *afs_alloc_operation(struct key *, struct afs_volume *);
 extern int afs_put_operation(struct afs_operation *);
 extern bool afs_begin_vnode_operation(struct afs_operation *);
+extern void afs_end_vnode_operation(struct afs_operation *op);
 extern void afs_wait_for_operation(struct afs_operation *);
 extern int afs_do_sync_operation(struct afs_operation *);
 
@@ -1188,8 +1233,8 @@ struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *est
 						  enum afs_estate_trace where);
 void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where);
 extern void afs_fileserver_probe_result(struct afs_call *);
-void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
-			     struct afs_addr_list *new_addrs, struct key *key);
+int afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
+			    struct afs_addr_list *new_alist, struct key *key);
 int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr);
 extern void afs_probe_fileserver(struct afs_net *, struct afs_server *);
 extern void afs_fs_probe_dispatcher(struct work_struct *);
@@ -1202,10 +1247,13 @@ extern void afs_fs_probe_cleanup(struct afs_net *);
  */
 extern const struct afs_operation_ops afs_fetch_status_operation;
 
+void afs_init_new_symlink(struct afs_vnode *vnode, struct afs_operation *op);
+const char *afs_get_link(struct dentry *dentry, struct inode *inode,
+			 struct delayed_call *callback);
+int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen);
 extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *);
 extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *);
 extern int afs_ilookup5_test_by_fid(struct inode *, void *);
-extern struct inode *afs_iget_pseudo_dir(struct super_block *, bool);
 extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *);
 extern struct inode *afs_root_iget(struct super_block *, struct key *);
 extern int afs_getattr(struct mnt_idmap *idmap, const struct path *,
@@ -1331,7 +1379,9 @@ extern int __net_init afs_open_socket(struct afs_net *);
 extern void __net_exit afs_close_socket(struct afs_net *);
 extern void afs_charge_preallocation(struct work_struct *);
 extern void afs_put_call(struct afs_call *);
+void afs_deferred_put_call(struct afs_call *call);
 void afs_make_call(struct afs_call *call, gfp_t gfp);
+void afs_deliver_to_call(struct afs_call *call);
 void afs_wait_for_call_to_complete(struct afs_call *call);
 extern struct afs_call *afs_alloc_flat_call(struct afs_net *,
 					    const struct afs_call_type *,
@@ -1342,6 +1392,28 @@ extern void afs_send_simple_reply(struct afs_call *, const void *, size_t);
 extern int afs_extract_data(struct afs_call *, bool);
 extern int afs_protocol_error(struct afs_call *, enum afs_eproto_cause);
 
+static inline struct afs_call *afs_get_call(struct afs_call *call,
+					    enum afs_call_trace why)
+{
+	int old_ref;
+
+	/* Take the extra ref, then trace the count as it stands afterwards. */
+	__refcount_inc(&call->ref, &old_ref);
+	trace_afs_call(call->debug_id, why, old_ref + 1,
+		       atomic_read(&call->net->nr_outstanding_calls),
+		       __builtin_return_address(0));
+	return call;
+}
+
+static inline void afs_see_call(struct afs_call *call, enum afs_call_trace why)
+{
+	int ref = refcount_read(&call->ref);	/* Sampled only, never changed */
+
+	trace_afs_call(call->debug_id, why, ref,
+		       atomic_read(&call->net->nr_outstanding_calls),
+		       __builtin_return_address(0));
+}
+
 static inline void afs_make_op_call(struct afs_operation *op, struct afs_call *call,
 				    gfp_t gfp)
 {
@@ -1463,20 +1535,30 @@ extern void __exit afs_clean_up_permit_cache(void);
  */
 extern spinlock_t afs_server_peer_lock;
 
-extern struct afs_server *afs_find_server(struct afs_net *, const struct rxrpc_peer *);
-extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *);
+struct afs_server *afs_find_server(const struct rxrpc_peer *peer);
 extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *, u32);
 extern struct afs_server *afs_get_server(struct afs_server *, enum afs_server_trace);
-extern struct afs_server *afs_use_server(struct afs_server *, enum afs_server_trace);
-extern void afs_unuse_server(struct afs_net *, struct afs_server *, enum afs_server_trace);
-extern void afs_unuse_server_notime(struct afs_net *, struct afs_server *, enum afs_server_trace);
+struct afs_server *afs_use_server(struct afs_server *server, bool activate,
+				  enum afs_server_trace reason);
+void afs_unuse_server(struct afs_net *net, struct afs_server *server,
+		      enum afs_server_trace reason);
+void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server,
+			     enum afs_server_trace reason);
 extern void afs_put_server(struct afs_net *, struct afs_server *, enum afs_server_trace);
-extern void afs_manage_servers(struct work_struct *);
-extern void afs_servers_timer(struct timer_list *);
+void afs_purge_servers(struct afs_cell *cell);
 extern void afs_fs_probe_timer(struct timer_list *);
-extern void __net_exit afs_purge_servers(struct afs_net *);
+void __net_exit afs_wait_for_servers(struct afs_net *net);
 bool afs_check_server_record(struct afs_operation *op, struct afs_server *server, struct key *key);
 
+static inline void afs_see_server(struct afs_server *server, enum afs_server_trace trace)
+{
+	int active, ref;
+
+	ref = refcount_read(&server->ref);	/* Sampled only, never changed */
+	active = atomic_read(&server->active);
+	trace_afs_server(server->debug_id, ref, active, trace);
+}
+
 static inline void afs_inc_servers_outstanding(struct afs_net *net)
 {
 	atomic_inc(&net->servers_outstanding);
@@ -1624,6 +1706,9 @@ extern void yfs_fs_remove_dir(struct afs_operation *);
 extern void yfs_fs_link(struct afs_operation *);
 extern void yfs_fs_symlink(struct afs_operation *);
 extern void yfs_fs_rename(struct afs_operation *);
+void yfs_fs_rename_replace(struct afs_operation *op);
+void yfs_fs_rename_noreplace(struct afs_operation *op);
+void yfs_fs_rename_exchange(struct afs_operation *op);
 extern void yfs_fs_store_data(struct afs_operation *);
 extern void yfs_fs_setattr(struct afs_operation *);
 extern void yfs_fs_get_volume_status(struct afs_operation *);
@@ -1708,6 +1793,38 @@ static inline int afs_bad(struct afs_vnode *vnode, enum afs_file_error where)
 	return -EIO;
 }
 
+/*
+ * Record @expires_at as a vnode's callback promise expiry, then trace it.
+ */
+static inline void afs_set_cb_promise(struct afs_vnode *vnode, time64_t expires_at,
+				      enum afs_cb_promise_trace trace)
+{
+	atomic64_set(&vnode->cb_expires_at, expires_at);
+	trace_afs_cb_promise(vnode, trace);
+}
+
+/*
+ * Clear the callback promise on a vnode, returning true if it was promised.
+ */
+static inline bool afs_clear_cb_promise(struct afs_vnode *vnode,
+					enum afs_cb_promise_trace trace)
+{
+	trace_afs_cb_promise(vnode, trace);	/* Emitted before the exchange below */
+	return atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE;
+}
+
+/*
+ * Flag a directory's contents as no longer valid.
+ */
+static inline void afs_invalidate_dir(struct afs_vnode *dvnode,
+				      enum afs_dir_invalid_trace trace)
+{
+	if (!test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+		return;		/* Wasn't marked valid: nothing to trace or count */
+	trace_afs_dir_invalid(dvnode, trace);
+	afs_stat_v(dvnode, n_inval);
+}
+
 /*****************************************************************************/
 /*
  * debug tracing
diff --git a/fs/afs/main.c b/fs/afs/main.c
index a14f6013e316..e6bb8237db98 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -73,28 +73,21 @@ static int __net_init afs_net_init(struct net *net_ns)
 	generate_random_uuid((unsigned char *)&net->uuid);
 
 	INIT_WORK(&net->charge_preallocation_work, afs_charge_preallocation);
+	INIT_WORK(&net->rx_oob_work, afs_process_oob_queue);
 	mutex_init(&net->socket_mutex);
 
 	net->cells = RB_ROOT;
+	idr_init(&net->cells_dyn_ino);
 	init_rwsem(&net->cells_lock);
-	INIT_WORK(&net->cells_manager, afs_manage_cells);
-	timer_setup(&net->cells_timer, afs_cells_timer, 0);
-
 	mutex_init(&net->cells_alias_lock);
 	mutex_init(&net->proc_cells_lock);
 	INIT_HLIST_HEAD(&net->proc_cells);
 
 	seqlock_init(&net->fs_lock);
-	net->fs_servers = RB_ROOT;
 	INIT_LIST_HEAD(&net->fs_probe_fast);
 	INIT_LIST_HEAD(&net->fs_probe_slow);
 	INIT_HLIST_HEAD(&net->fs_proc);
 
-	INIT_HLIST_HEAD(&net->fs_addresses);
-	seqlock_init(&net->fs_addr_lock);
-
-	INIT_WORK(&net->fs_manager, afs_manage_servers);
-	timer_setup(&net->fs_timer, afs_servers_timer, 0);
 	INIT_WORK(&net->fs_prober, afs_fs_probe_dispatcher);
 	timer_setup(&net->fs_probe_timer, afs_fs_probe_timer, 0);
 	atomic_set(&net->servers_outstanding, 1);
@@ -130,13 +123,14 @@ error_open_socket:
 	net->live = false;
 	afs_fs_probe_cleanup(net);
 	afs_cell_purge(net);
-	afs_purge_servers(net);
+	afs_wait_for_servers(net);
 error_cell_init:
 	net->live = false;
 	afs_proc_cleanup(net);
 error_proc:
 	afs_put_sysnames(net->sysnames);
 error_sysnames:
+	idr_destroy(&net->cells_dyn_ino);
 	net->live = false;
 	return ret;
 }
@@ -151,10 +145,11 @@ static void __net_exit afs_net_exit(struct net *net_ns)
 	net->live = false;
 	afs_fs_probe_cleanup(net);
 	afs_cell_purge(net);
-	afs_purge_servers(net);
+	afs_wait_for_servers(net);
 	afs_close_socket(net);
 	afs_proc_cleanup(net);
 	afs_put_sysnames(net->sysnames);
+	idr_destroy(&net->cells_dyn_ino);
 	kfree_rcu(rcu_access_pointer(net->address_prefs), rcu);
 }
 
@@ -174,13 +169,13 @@ static int __init afs_init(void)
 
 	printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n");
 
-	afs_wq = alloc_workqueue("afs", 0, 0);
+	afs_wq = alloc_workqueue("afs", WQ_PERCPU, 0);
 	if (!afs_wq)
 		goto error_afs_wq;
-	afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM, 0);
+	afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
 	if (!afs_async_calls)
 		goto error_async;
-	afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM, 0);
+	afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM | WQ_PERCPU, 0);
 	if (!afs_lock_manager)
 		goto error_lockmgr;
 
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index b8180bf2281f..c8a7f266080d 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -8,6 +8,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/errno.h>
+#include <crypto/krb5.h>
 #include "internal.h"
 #include "afs_fs.h"
 #include "protocol_uae.h"
@@ -103,7 +104,34 @@ int afs_abort_to_error(u32 abort_code)
 	case RXKADDATALEN:	return -EKEYREJECTED;
 	case RXKADILLEGALLEVEL:	return -EKEYREJECTED;
 
+	case RXGK_INCONSISTENCY:	return -EPROTO;
+	case RXGK_PACKETSHORT:		return -EPROTO;
+	case RXGK_BADCHALLENGE:		return -EPROTO;
+	case RXGK_SEALEDINCON:		return -EKEYREJECTED;
+	case RXGK_NOTAUTH:		return -EKEYREJECTED;
+	case RXGK_EXPIRED:		return -EKEYEXPIRED;
+	case RXGK_BADLEVEL:		return -EKEYREJECTED;
+	case RXGK_BADKEYNO:		return -EKEYREJECTED;
+	case RXGK_NOTRXGK:		return -EKEYREJECTED;
+	case RXGK_UNSUPPORTED:		return -EKEYREJECTED;
+	case RXGK_GSSERROR:		return -EKEYREJECTED;
+#ifdef RXGK_BADETYPE
+	case RXGK_BADETYPE:		return -ENOPKG;
+#endif
+#ifdef RXGK_BADTOKEN
+	case RXGK_BADTOKEN:		return -EKEYREJECTED;
+#endif
+#ifdef RXGK_DATALEN
+	case RXGK_DATALEN:		return -EPROTO;
+#endif
+#ifdef RXGK_BADQOP
+	case RXGK_BADQOP:		return -EKEYREJECTED;
+#endif
+
+	case KRB5_PROG_KEYTYPE_NOSUPP:	return -ENOPKG;
+
 	case RXGEN_OPCODE:	return -ENOTSUPP;
+	case RX_INVALID_OPERATION:	return -ENOTSUPP;
 
 	default:		return -EREMOTEIO;
 	}
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 97f50e9fd9eb..57c204a3c04e 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -30,7 +30,7 @@ const struct file_operations afs_mntpt_file_operations = {
 
 const struct inode_operations afs_mntpt_inode_operations = {
 	.lookup		= afs_mntpt_lookup,
-	.readlink	= page_readlink,
+	.readlink	= afs_readlink,
 	.getattr	= afs_getattr,
 };
 
@@ -87,7 +87,7 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
 		ctx->force = true;
 	}
 	if (ctx->cell) {
-		afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_mntpt);
+		afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_mntpt);
 		ctx->cell = NULL;
 	}
 	if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) {
@@ -107,7 +107,9 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
 		if (size > AFS_MAXCELLNAME)
 			return -ENAMETOOLONG;
 
-		cell = afs_lookup_cell(ctx->net, p, size, NULL, false);
+		cell = afs_lookup_cell(ctx->net, p, size, NULL,
+				       AFS_LOOKUP_CELL_MOUNTPOINT,
+				       afs_cell_trace_use_lookup_mntpt);
 		if (IS_ERR(cell)) {
 			pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt);
 			return PTR_ERR(cell);
@@ -118,9 +120,9 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
 		ctx->volnamesz = sizeof(afs_root_volume) - 1;
 	} else {
 		/* read the contents of the AFS special symlink */
-		struct page *page;
+		DEFINE_DELAYED_CALL(cleanup);
+		const char *content;
 		loff_t size = i_size_read(d_inode(mntpt));
-		char *buf;
 
 		if (src_as->cell)
 			ctx->cell = afs_use_cell(src_as->cell, afs_cell_trace_use_mntpt);
@@ -128,18 +130,24 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
 		if (size < 2 || size > PAGE_SIZE - 1)
 			return -EINVAL;
 
-		page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL);
-		if (IS_ERR(page))
-			return PTR_ERR(page);
+		content = afs_get_link(mntpt, d_inode(mntpt), &cleanup);
+		if (IS_ERR(content)) {
+			do_delayed_call(&cleanup);
+			return PTR_ERR(content);
+		}
 
-		buf = kmap(page);
 		ret = -EINVAL;
-		if (buf[size - 1] == '.')
-			ret = vfs_parse_fs_string(fc, "source", buf, size - 1);
-		kunmap(page);
-		put_page(page);
+		if (content[size - 1] == '.')
+			ret = vfs_parse_fs_qstr(fc, "source",
+						&QSTR_LEN(content, size - 1));
+		do_delayed_call(&cleanup);
 		if (ret < 0)
 			return ret;
+
+		/* Don't cross a backup volume mountpoint from a backup volume */
+		if (src_as->volume && src_as->volume->type == AFSVL_BACKVOL &&
+		    ctx->type == AFSVL_BACKVOL)
+			return -ENODEV;
 	}
 
 	return 0;
@@ -183,7 +191,6 @@ struct vfsmount *afs_d_automount(struct path *path)
 	if (IS_ERR(newmnt))
 		return newmnt;
 
-	mntget(newmnt); /* prevent immediate expiration */
 	mnt_set_expiry(newmnt, &afs_vfsmounts);
 	queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
 			   afs_mntpt_expiry_timeout * HZ);
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 15eab053af6d..44520549b509 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -122,14 +122,16 @@ static int afs_proc_cells_write(struct file *file, char *buf, size_t size)
 	if (strcmp(buf, "add") == 0) {
 		struct afs_cell *cell;
 
-		cell = afs_lookup_cell(net, name, strlen(name), args, true);
+		cell = afs_lookup_cell(net, name, strlen(name), args,
+				       AFS_LOOKUP_CELL_PRELOAD,
+				       afs_cell_trace_use_lookup_add);
 		if (IS_ERR(cell)) {
 			ret = PTR_ERR(cell);
 			goto done;
 		}
 
 		if (test_and_set_bit(AFS_CELL_FL_NO_GC, &cell->flags))
-			afs_unuse_cell(net, cell, afs_cell_trace_unuse_no_pin);
+			afs_unuse_cell(cell, afs_cell_trace_unuse_no_pin);
 	} else {
 		goto inval;
 	}
@@ -206,7 +208,7 @@ static int afs_proc_rootcell_show(struct seq_file *m, void *v)
 
 	net = afs_seq2net_single(m);
 	down_read(&net->cells_lock);
-	cell = net->ws_cell;
+	cell = rcu_dereference_protected(net->ws_cell, lockdep_is_held(&net->cells_lock));
 	if (cell)
 		seq_printf(m, "%s\n", cell->name);
 	up_read(&net->cells_lock);
@@ -240,7 +242,13 @@ static int afs_proc_rootcell_write(struct file *file, char *buf, size_t size)
 	/* determine command to perform */
 	_debug("rootcell=%s", buf);
 
-	ret = afs_cell_init(net, buf);
+	ret = -EEXIST;
+	inode_lock(file_inode(file));
+	if (!rcu_access_pointer(net->ws_cell))
+		ret = afs_cell_init(net, buf);
+	else
+		printk("busy\n");
+	inode_unlock(file_inode(file));
 
 out:
 	_leave(" = %d", ret);
@@ -437,8 +445,6 @@ static int afs_proc_servers_show(struct seq_file *m, void *v)
 	}
 
 	server = list_entry(v, struct afs_server, proc_link);
-	estate = rcu_dereference(server->endpoint_state);
-	alist = estate->addresses;
 	seq_printf(m, "%pU %3d %3d %s\n",
 		   &server->uuid,
 		   refcount_read(&server->ref),
@@ -448,10 +454,16 @@ static int afs_proc_servers_show(struct seq_file *m, void *v)
 		   server->flags, server->rtt);
 	seq_printf(m, "  - probe: last=%d\n",
 		   (int)(jiffies - server->probed_at) / HZ);
+
+	estate = rcu_dereference(server->endpoint_state);
+	if (!estate)
+		goto out;
 	failed = estate->failed_set;
 	seq_printf(m, "  - ESTATE pq=%x np=%u rsp=%lx f=%lx\n",
 		   estate->probe_seq, atomic_read(&estate->nr_probing),
 		   estate->responsive_set, estate->failed_set);
+
+	alist = estate->addresses;
 	seq_printf(m, "  - ALIST v=%u ap=%u\n",
 		   alist->version, alist->addr_pref_version);
 	for (i = 0; i < alist->nr_addrs; i++) {
@@ -464,6 +476,8 @@ static int afs_proc_servers_show(struct seq_file *m, void *v)
 			   rxrpc_kernel_get_srtt(addr->peer),
 			   addr->last_error, addr->prio);
 	}
+
+out:
 	return 0;
 }
 
diff --git a/fs/afs/protocol_yfs.h b/fs/afs/protocol_yfs.h
index e4cd89c44c46..b2f06c1917c2 100644
--- a/fs/afs/protocol_yfs.h
+++ b/fs/afs/protocol_yfs.h
@@ -50,6 +50,9 @@ enum YFS_FS_Operations {
 	YFSREMOVEACL		= 64171,
 	YFSREMOVEFILE2		= 64173,
 	YFSSTOREOPAQUEACL2	= 64174,
+	YFSRENAME_REPLACE	= 64176,
+	YFSRENAME_NOREPLACE	= 64177,
+	YFSRENAME_EXCHANGE	= 64187,
 	YFSINLINEBULKSTATUS	= 64536, /* YFS Fetch multiple file statuses with errors */
 	YFSFETCHDATA64		= 64537, /* YFS Fetch file data */
 	YFSSTOREDATA64		= 64538, /* YFS Store file data */
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index ed09d4d4c211..6a4e7da10fc4 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -99,7 +99,7 @@ static bool afs_start_fs_iteration(struct afs_operation *op,
 		write_seqlock(&vnode->cb_lock);
 		ASSERTCMP(cb_server, ==, vnode->cb_server);
 		vnode->cb_server = NULL;
-		if (atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE)
+		if (afs_clear_cb_promise(vnode, afs_cb_promise_clear_rotate_server))
 			vnode->cb_break++;
 		write_sequnlock(&vnode->cb_lock);
 	}
@@ -432,6 +432,16 @@ bool afs_select_fileserver(struct afs_operation *op)
 			afs_op_set_error(op, -EDQUOT);
 			goto failed_but_online;
 
+		case RX_INVALID_OPERATION:
+		case RXGEN_OPCODE:
+			/* Handle downgrading to an older operation. */
+			afs_op_set_error(op, -ENOTSUPP);
+			if (op->flags & AFS_OPERATION_DOWNGRADE) {
+				op->flags &= ~AFS_OPERATION_DOWNGRADE;
+				goto go_again;
+			}
+			goto failed_but_online;
+
 		default:
 			afs_op_accumulate_error(op, error, abort_code);
 		failed_but_online:
@@ -583,7 +593,7 @@ selected_server:
 	if (vnode->cb_server != server) {
 		vnode->cb_server = server;
 		vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break);
-		atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE);
+		afs_clear_cb_promise(vnode, afs_cb_promise_clear_server_change);
 	}
 
 retry_server:
@@ -620,20 +630,23 @@ iterate_address:
 	op->addr_index = addr_index;
 	set_bit(addr_index, &op->addr_tried);
 
-	op->volsync.creation = TIME64_MIN;
-	op->volsync.update = TIME64_MIN;
-	op->call_responded = false;
 	_debug("address [%u] %u/%u %pISp",
 	       op->server_index, addr_index, alist->nr_addrs,
 	       rxrpc_kernel_remote_addr(alist->addrs[op->addr_index].peer));
+go_again:
+	op->volsync.creation = TIME64_MIN;
+	op->volsync.update = TIME64_MIN;
+	op->call_responded = false;
 	_leave(" = t");
 	return true;
 
 wait_for_more_probe_results:
 	error = afs_wait_for_one_fs_probe(op->server, op->estate, op->addr_tried,
 					  !(op->flags & AFS_OPERATION_UNINTR));
-	if (!error)
+	if (error == 1)
 		goto iterate_address;
+	if (!error)
+		goto restart_from_beginning;
 
 	/* We've now had a failure to respond on all of a server's addresses -
 	 * immediately probe them again and consider retrying the server.
@@ -644,10 +657,13 @@ wait_for_more_probe_results:
 		error = afs_wait_for_one_fs_probe(op->server, op->estate, op->addr_tried,
 						  !(op->flags & AFS_OPERATION_UNINTR));
 		switch (error) {
-		case 0:
+		case 1:
 			op->flags &= ~AFS_OPERATION_RETRY_SERVER;
-			trace_afs_rotate(op, afs_rotate_trace_retry_server, 0);
+			trace_afs_rotate(op, afs_rotate_trace_retry_server, 1);
 			goto retry_server;
+		case 0:
+			trace_afs_rotate(op, afs_rotate_trace_retry_server, 0);
+			goto restart_from_beginning;
 		case -ERESTARTSYS:
 			afs_op_set_error(op, error);
 			goto failed;
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index c453428f3c8b..bf0e4ea0aafd 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -18,13 +18,23 @@
 
 struct workqueue_struct *afs_async_calls;
 
+static void afs_deferred_free_worker(struct work_struct *work);
 static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long);
 static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long);
 static void afs_process_async_call(struct work_struct *);
 static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long);
 static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long);
+static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID);
+static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob);
 static int afs_deliver_cm_op_id(struct afs_call *);
 
+static const struct rxrpc_kernel_ops afs_rxrpc_callback_ops = {	/* Set in afs_open_socket() */
+	.notify_new_call	= afs_rx_new_call,
+	.discard_new_call	= afs_rx_discard_new_call,
+	.user_attach_call	= afs_rx_attach,
+	.notify_oob		= afs_rx_notify_oob,
+};
+
 /* asynchronous incoming call initial processing */
 static const struct afs_call_type afs_RXCMxxxx = {
 	.name		= "CB.xxxx",
@@ -48,6 +58,7 @@ int afs_open_socket(struct afs_net *net)
 		goto error_1;
 
 	socket->sk->sk_allocation = GFP_NOFS;
+	socket->sk->sk_user_data = net;
 
 	/* bind the callback manager's address to make this a server socket */
 	memset(&srx, 0, sizeof(srx));
@@ -63,16 +74,24 @@ int afs_open_socket(struct afs_net *net)
 	if (ret < 0)
 		goto error_2;
 
-	ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
+	ret = rxrpc_sock_set_manage_response(socket->sk, true);
+	if (ret < 0)
+		goto error_2;
+
+	ret = afs_create_token_key(net, socket);
+	if (ret < 0)
+		pr_err("Couldn't create RxGK CM key: %d\n", ret);
+
+	ret = kernel_bind(socket, (struct sockaddr_unsized *) &srx, sizeof(srx));
 	if (ret == -EADDRINUSE) {
 		srx.transport.sin6.sin6_port = 0;
-		ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
+		ret = kernel_bind(socket, (struct sockaddr_unsized *) &srx, sizeof(srx));
 	}
 	if (ret < 0)
 		goto error_2;
 
 	srx.srx_service = YFS_CM_SERVICE;
-	ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
+	ret = kernel_bind(socket, (struct sockaddr_unsized *) &srx, sizeof(srx));
 	if (ret < 0)
 		goto error_2;
 
@@ -83,8 +102,7 @@ int afs_open_socket(struct afs_net *net)
 	 * it sends back to us.
 	 */
 
-	rxrpc_kernel_new_call_notification(socket, afs_rx_new_call,
-					   afs_rx_discard_new_call);
+	rxrpc_kernel_set_notifications(socket, &afs_rxrpc_callback_ops);
 
 	ret = kernel_listen(socket, INT_MAX);
 	if (ret < 0)
@@ -124,7 +142,9 @@ void afs_close_socket(struct afs_net *net)
 
 	kernel_sock_shutdown(net->socket, SHUT_RDWR);
 	flush_workqueue(afs_async_calls);
+	net->socket->sk->sk_user_data = NULL;
 	sock_release(net->socket);
+	key_put(net->fs_cm_token_key);
 
 	_debug("dework");
 	_leave("");
@@ -148,7 +168,9 @@ static struct afs_call *afs_alloc_call(struct afs_net *net,
 	call->net = net;
 	call->debug_id = atomic_inc_return(&rxrpc_debug_id);
 	refcount_set(&call->ref, 1);
-	INIT_WORK(&call->async_work, afs_process_async_call);
+	INIT_WORK(&call->async_work, type->async_rx ?: afs_process_async_call);
+	INIT_WORK(&call->work, call->type->work);
+	INIT_WORK(&call->free_work, afs_deferred_free_worker);
 	init_waitqueue_head(&call->waitq);
 	spin_lock_init(&call->state_lock);
 	call->iter = &call->def_iter;
@@ -159,6 +181,36 @@ static struct afs_call *afs_alloc_call(struct afs_net *net,
 	return call;
 }
 
+static void afs_free_call(struct afs_call *call)
+{
+	struct afs_net *net = call->net;
+
+	ASSERT(!work_pending(&call->async_work));
+
+	rxrpc_kernel_put_peer(call->peer);
+
+	if (call->rxcall) {
+		rxrpc_kernel_shutdown_call(net->socket, call->rxcall);
+		rxrpc_kernel_put_call(net->socket, call->rxcall);
+		call->rxcall = NULL;
+	}
+	if (call->type->destructor)
+		call->type->destructor(call);
+
+	afs_unuse_server_notime(call->net, call->server, afs_server_trace_unuse_call);
+	kfree(call->request);
+
+	/* Log the free while call->debug_id can still be read. */
+	trace_afs_call(call->debug_id, afs_call_trace_free, 0,
+		       atomic_read(&net->nr_outstanding_calls),
+		       __builtin_return_address(0));
+	kfree(call);
+
+	/* net was cached on entry, so it is still usable after the kfree(). */
+	if (atomic_dec_return(&net->nr_outstanding_calls) == 0)
+		wake_up_var(&net->nr_outstanding_calls);
+}
+
 /*
  * Dispose of a reference on a call.
  */
@@ -173,45 +225,34 @@ void afs_put_call(struct afs_call *call)
 	o = atomic_read(&net->nr_outstanding_calls);
 	trace_afs_call(debug_id, afs_call_trace_put, r - 1, o,
 		       __builtin_return_address(0));
+	if (zero)
+		afs_free_call(call);
+}
 
-	if (zero) {
-		ASSERT(!work_pending(&call->async_work));
-		ASSERT(call->type->name != NULL);
-
-		rxrpc_kernel_put_peer(call->peer);
-
-		if (call->rxcall) {
-			rxrpc_kernel_shutdown_call(net->socket, call->rxcall);
-			rxrpc_kernel_put_call(net->socket, call->rxcall);
-			call->rxcall = NULL;
-		}
-		if (call->type->destructor)
-			call->type->destructor(call);
-
-		afs_unuse_server_notime(call->net, call->server, afs_server_trace_put_call);
-		kfree(call->request);
-
-		trace_afs_call(call->debug_id, afs_call_trace_free, 0, o,
-			       __builtin_return_address(0));
-		kfree(call);
+static void afs_deferred_free_worker(struct work_struct *work)
+{
+	struct afs_call *call = container_of(work, struct afs_call, free_work);
 
-		o = atomic_dec_return(&net->nr_outstanding_calls);
-		if (o == 0)
-			wake_up_var(&net->nr_outstanding_calls);
-	}
+	afs_free_call(call);
 }
 
-static struct afs_call *afs_get_call(struct afs_call *call,
-				     enum afs_call_trace why)
+/*
+ * Dispose of a reference on a call, deferring the cleanup to a workqueue
+ * to avoid lock recursion.
+ */
+void afs_deferred_put_call(struct afs_call *call)
 {
-	int r;
-
-	__refcount_inc(&call->ref, &r);
+	struct afs_net *net = call->net;
+	unsigned int debug_id = call->debug_id;
+	bool zero;
+	int r, o;
 
-	trace_afs_call(call->debug_id, why, r + 1,
-		       atomic_read(&call->net->nr_outstanding_calls),
+	zero = __refcount_dec_and_test(&call->ref, &r);
+	o = atomic_read(&net->nr_outstanding_calls);
+	trace_afs_call(debug_id, afs_call_trace_put, r - 1, o,
 		       __builtin_return_address(0));
-	return call;
+	if (zero)
+		schedule_work(&call->free_work);
 }
 
 /*
@@ -220,8 +261,6 @@ static struct afs_call *afs_get_call(struct afs_call *call,
 static void afs_queue_call_work(struct afs_call *call)
 {
 	if (call->type->work) {
-		INIT_WORK(&call->work, call->type->work);
-
 		afs_get_call(call, afs_call_trace_work);
 		if (!queue_work(afs_wq, &call->work))
 			afs_put_call(call);
@@ -396,11 +435,16 @@ void afs_make_call(struct afs_call *call, gfp_t gfp)
 	return;
 
 error_do_abort:
-	if (ret != -ECONNABORTED) {
+	if (ret != -ECONNABORTED)
 		rxrpc_kernel_abort_call(call->net->socket, rxcall,
 					RX_USER_ABORT, ret,
 					afs_abort_send_data_error);
-	} else {
+	if (call->async) {
+		afs_see_call(call, afs_call_trace_async_abort);
+		return;
+	}
+
+	if (ret == -ECONNABORTED) {
 		len = 0;
 		iov_iter_kvec(&msg.msg_iter, ITER_DEST, NULL, 0, 0);
 		rxrpc_kernel_recv_data(call->net->socket, rxcall,
@@ -411,8 +455,10 @@ error_do_abort:
 	call->error = ret;
 	trace_afs_call_done(call);
 error_kill_call:
-	if (call->type->done)
-		call->type->done(call);
+	if (call->async)
+		afs_see_call(call, afs_call_trace_async_kill);
+	if (call->type->immediate_cancel)
+		call->type->immediate_cancel(call);
 
 	/* We need to dispose of the extra ref we grabbed for an async call.
 	 * The call, however, might be queued on afs_async_calls and we need to
@@ -467,7 +513,7 @@ static void afs_log_error(struct afs_call *call, s32 remote_abort)
 /*
  * deliver messages to a call
  */
-static void afs_deliver_to_call(struct afs_call *call)
+void afs_deliver_to_call(struct afs_call *call)
 {
 	enum afs_call_state state;
 	size_t len;
@@ -568,7 +614,6 @@ local_abort:
 	abort_code = 0;
 call_complete:
 	afs_set_call_complete(call, ret, remote_abort);
-	state = AFS_CALL_COMPLETE;
 	goto done;
 }
 
@@ -640,7 +685,8 @@ static void afs_wake_up_call_waiter(struct sock *sk, struct rxrpc_call *rxcall,
 }
 
 /*
- * wake up an asynchronous call
+ * Wake up an asynchronous call.  The caller is holding the call notify
+ * spinlock around this, so we can't call afs_put_call().
  */
 static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall,
 				   unsigned long call_user_ID)
@@ -657,7 +703,7 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall,
 			       __builtin_return_address(0));
 
 		if (!queue_work(afs_async_calls, &call->async_work))
-			afs_put_call(call);
+			afs_deferred_put_call(call);
 	}
 }
 
@@ -711,7 +757,6 @@ void afs_charge_preallocation(struct work_struct *work)
 
 		if (rxrpc_kernel_charge_accept(net->socket,
 					       afs_wake_up_async_call,
-					       afs_rx_attach,
 					       (unsigned long)call,
 					       GFP_KERNEL,
 					       call->debug_id) < 0)
@@ -739,8 +784,14 @@ static void afs_rx_discard_new_call(struct rxrpc_call *rxcall,
 static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall,
 			    unsigned long user_call_ID)
 {
+	struct afs_call *call = (struct afs_call *)user_call_ID;
 	struct afs_net *net = afs_sock2net(sk);
 
+	call->peer = rxrpc_kernel_get_call_peer(sk->sk_socket, call->rxcall);
+	call->server = afs_find_server(call->peer);
+	if (!call->server)
+		trace_afs_cm_no_server(call, rxrpc_kernel_remote_srx(call->peer));
+
 	queue_work(afs_wq, &net->charge_preallocation_work);
 }
 
@@ -767,9 +818,14 @@ static int afs_deliver_cm_op_id(struct afs_call *call)
 	if (!afs_cm_incoming_call(call))
 		return -ENOTSUPP;
 
+	call->security_ix = rxrpc_kernel_query_call_security(call->rxcall,
+							     &call->service_id,
+							     &call->enctype);
+
 	trace_afs_cb_call(call);
+	call->work.func = call->type->work;
 
-	/* pass responsibility for the remainer of this message off to the
+	/* pass responsibility for the remainder of this message off to the
 	 * cache manager op */
 	return call->type->deliver(call);
 }
@@ -918,3 +974,13 @@ noinline int afs_protocol_error(struct afs_call *call,
 		call->unmarshalling_error = true;
 	return -EBADMSG;
 }
+
+/*
+ * Wake up OOB notification processing.
+ */
+static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob)
+{
+	struct afs_net *net = sk->sk_user_data;
+
+	schedule_work(&net->rx_oob_work);
+}
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 6a7744c9e2a2..55ddce94af03 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -16,6 +16,31 @@
 
 static DEFINE_HASHTABLE(afs_permits_cache, 10);
 static DEFINE_SPINLOCK(afs_permits_lock);
+static DEFINE_MUTEX(afs_key_lock);
+
+/*
+ * Allocate a key to use as a placeholder for anonymous user security.
+ */
+static int afs_alloc_anon_key(struct afs_cell *cell)
+{
+	struct key *key;
+
+	mutex_lock(&afs_key_lock);
+	key = cell->anonymous_key;
+	if (!key) {
+		key = rxrpc_get_null_key(cell->key_desc);
+		if (!IS_ERR(key))
+			cell->anonymous_key = key;
+	}
+	mutex_unlock(&afs_key_lock);
+
+	if (IS_ERR(key))
+		return PTR_ERR(key);
+
+	_debug("anon key %p{%x}",
+	       cell->anonymous_key, key_serial(cell->anonymous_key));
+	return 0;
+}
 
 /*
  * get a key
@@ -23,11 +48,12 @@ static DEFINE_SPINLOCK(afs_permits_lock);
 struct key *afs_request_key(struct afs_cell *cell)
 {
 	struct key *key;
+	int ret;
 
-	_enter("{%x}", key_serial(cell->anonymous_key));
+	_enter("{%s}", cell->key_desc);
 
-	_debug("key %s", cell->anonymous_key->description);
-	key = request_key_net(&key_type_rxrpc, cell->anonymous_key->description,
+	_debug("key %s", cell->key_desc);
+	key = request_key_net(&key_type_rxrpc, cell->key_desc,
 			      cell->net->net, NULL);
 	if (IS_ERR(key)) {
 		if (PTR_ERR(key) != -ENOKEY) {
@@ -35,6 +61,12 @@ struct key *afs_request_key(struct afs_cell *cell)
 			return key;
 		}
 
+		if (!cell->anonymous_key) {
+			ret = afs_alloc_anon_key(cell);
+			if (ret < 0)
+				return ERR_PTR(ret);
+		}
+
 		/* act as anonymous user */
 		_leave(" = {%x} [anon]", key_serial(cell->anonymous_key));
 		return key_get(cell->anonymous_key);
@@ -52,11 +84,10 @@ struct key *afs_request_key_rcu(struct afs_cell *cell)
 {
 	struct key *key;
 
-	_enter("{%x}", key_serial(cell->anonymous_key));
+	_enter("{%s}", cell->key_desc);
 
-	_debug("key %s", cell->anonymous_key->description);
-	key = request_key_net_rcu(&key_type_rxrpc,
-				  cell->anonymous_key->description,
+	_debug("key %s", cell->key_desc);
+	key = request_key_net_rcu(&key_type_rxrpc, cell->key_desc,
 				  cell->net->net);
 	if (IS_ERR(key)) {
 		if (PTR_ERR(key) != -ENOKEY) {
@@ -65,6 +96,8 @@ struct key *afs_request_key_rcu(struct afs_cell *cell)
 		}
 
 		/* act as anonymous user */
+		if (!cell->anonymous_key)
+			return NULL; /* Need to allocate */
 		_leave(" = {%x} [anon]", key_serial(cell->anonymous_key));
 		return key_get(cell->anonymous_key);
 	} else {
@@ -408,7 +441,7 @@ int afs_permission(struct mnt_idmap *idmap, struct inode *inode,
 
 	if (mask & MAY_NOT_BLOCK) {
 		key = afs_request_key_rcu(vnode->volume->cell);
-		if (IS_ERR(key))
+		if (IS_ERR_OR_NULL(key))
 			return -ECHILD;
 
 		ret = -ECHILD;
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 038f9d0ae3af..c4428ebddb1d 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -14,188 +14,104 @@
 static unsigned afs_server_gc_delay = 10;	/* Server record timeout in seconds */
 static atomic_t afs_server_debug_id;
 
-static struct afs_server *afs_maybe_use_server(struct afs_server *,
-					       enum afs_server_trace);
 static void __afs_put_server(struct afs_net *, struct afs_server *);
+static void afs_server_timer(struct timer_list *timer);
+static void afs_server_destroyer(struct work_struct *work);
 
 /*
  * Find a server by one of its addresses.
  */
-struct afs_server *afs_find_server(struct afs_net *net, const struct rxrpc_peer *peer)
+struct afs_server *afs_find_server(const struct rxrpc_peer *peer)
 {
-	const struct afs_endpoint_state *estate;
-	const struct afs_addr_list *alist;
-	struct afs_server *server = NULL;
-	unsigned int i;
-	int seq = 1;
+	struct afs_server *server = (struct afs_server *)rxrpc_kernel_get_peer_data(peer);
 
-	rcu_read_lock();
-
-	do {
-		if (server)
-			afs_unuse_server_notime(net, server, afs_server_trace_put_find_rsq);
-		server = NULL;
-		seq++; /* 2 on the 1st/lockless path, otherwise odd */
-		read_seqbegin_or_lock(&net->fs_addr_lock, &seq);
-
-		hlist_for_each_entry_rcu(server, &net->fs_addresses, addr_link) {
-			estate = rcu_dereference(server->endpoint_state);
-			alist = estate->addresses;
-			for (i = 0; i < alist->nr_addrs; i++)
-				if (alist->addrs[i].peer == peer)
-					goto found;
-		}
-
-		server = NULL;
-		continue;
-	found:
-		server = afs_maybe_use_server(server, afs_server_trace_get_by_addr);
-
-	} while (need_seqretry(&net->fs_addr_lock, seq));
-
-	done_seqretry(&net->fs_addr_lock, seq);
-
-	rcu_read_unlock();
-	return server;
+	if (!server)
+		return NULL;
+	return afs_use_server(server, false, afs_server_trace_use_cm_call);
 }
 
 /*
- * Look up a server by its UUID and mark it active.
+ * Look up a server by its UUID and mark it active.  The caller must hold
+ * cell->fs_lock.
  */
-struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uuid)
+static struct afs_server *afs_find_server_by_uuid(struct afs_cell *cell, const uuid_t *uuid)
 {
-	struct afs_server *server = NULL;
+	struct afs_server *server;
 	struct rb_node *p;
-	int diff, seq = 1;
+	int diff;
 
 	_enter("%pU", uuid);
 
-	do {
-		/* Unfortunately, rbtree walking doesn't give reliable results
-		 * under just the RCU read lock, so we have to check for
-		 * changes.
-		 */
-		if (server)
-			afs_unuse_server(net, server, afs_server_trace_put_uuid_rsq);
-		server = NULL;
-		seq++; /* 2 on the 1st/lockless path, otherwise odd */
-		read_seqbegin_or_lock(&net->fs_lock, &seq);
-
-		p = net->fs_servers.rb_node;
-		while (p) {
-			server = rb_entry(p, struct afs_server, uuid_rb);
-
-			diff = memcmp(uuid, &server->uuid, sizeof(*uuid));
-			if (diff < 0) {
-				p = p->rb_left;
-			} else if (diff > 0) {
-				p = p->rb_right;
-			} else {
-				afs_use_server(server, afs_server_trace_get_by_uuid);
-				break;
-			}
-
-			server = NULL;
-		}
-	} while (need_seqretry(&net->fs_lock, seq));
+	p = cell->fs_servers.rb_node;
+	while (p) {
+		server = rb_entry(p, struct afs_server, uuid_rb);
 
-	done_seqretry(&net->fs_lock, seq);
+		diff = memcmp(uuid, &server->uuid, sizeof(*uuid));
+		if (diff < 0) {
+			p = p->rb_left;
+		} else if (diff > 0) {
+			p = p->rb_right;
+		} else {
+			if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags))
+				return NULL; /* Need a write lock */
+			afs_use_server(server, true, afs_server_trace_use_by_uuid);
+			return server;
+		}
+	}
 
-	_leave(" = %p", server);
-	return server;
+	return NULL;
 }
 
 /*
- * Install a server record in the namespace tree.  If there's a clash, we stick
- * it into a list anchored on whichever afs_server struct is actually in the
- * tree.
+ * Install a server record in the cell tree.  The caller must hold an exclusive
+ * lock on cell->fs_lock.
  */
 static struct afs_server *afs_install_server(struct afs_cell *cell,
-					     struct afs_server *candidate)
+					     struct afs_server **candidate)
 {
-	const struct afs_endpoint_state *estate;
-	const struct afs_addr_list *alist;
-	struct afs_server *server, *next;
+	struct afs_server *server;
 	struct afs_net *net = cell->net;
 	struct rb_node **pp, *p;
 	int diff;
 
 	_enter("%p", candidate);
 
-	write_seqlock(&net->fs_lock);
-
 	/* Firstly install the server in the UUID lookup tree */
-	pp = &net->fs_servers.rb_node;
+	pp = &cell->fs_servers.rb_node;
 	p = NULL;
 	while (*pp) {
 		p = *pp;
 		_debug("- consider %p", p);
 		server = rb_entry(p, struct afs_server, uuid_rb);
-		diff = memcmp(&candidate->uuid, &server->uuid, sizeof(uuid_t));
-		if (diff < 0) {
+		diff = memcmp(&(*candidate)->uuid, &server->uuid, sizeof(uuid_t));
+		if (diff < 0)
 			pp = &(*pp)->rb_left;
-		} else if (diff > 0) {
+		else if (diff > 0)
 			pp = &(*pp)->rb_right;
-		} else {
-			if (server->cell == cell)
-				goto exists;
-
-			/* We have the same UUID representing servers in
-			 * different cells.  Append the new server to the list.
-			 */
-			for (;;) {
-				next = rcu_dereference_protected(
-					server->uuid_next,
-					lockdep_is_held(&net->fs_lock.lock));
-				if (!next)
-					break;
-				server = next;
-			}
-			rcu_assign_pointer(server->uuid_next, candidate);
-			candidate->uuid_prev = server;
-			server = candidate;
-			goto added_dup;
-		}
+		else
+			goto exists;
 	}
 
-	server = candidate;
+	server = *candidate;
+	*candidate = NULL;
 	rb_link_node(&server->uuid_rb, p, pp);
-	rb_insert_color(&server->uuid_rb, &net->fs_servers);
+	rb_insert_color(&server->uuid_rb, &cell->fs_servers);
+	write_seqlock(&net->fs_lock);
 	hlist_add_head_rcu(&server->proc_link, &net->fs_proc);
+	write_sequnlock(&net->fs_lock);
 
-added_dup:
-	write_seqlock(&net->fs_addr_lock);
-	estate = rcu_dereference_protected(server->endpoint_state,
-					   lockdep_is_held(&net->fs_addr_lock.lock));
-	alist = estate->addresses;
-
-	/* Secondly, if the server has any IPv4 and/or IPv6 addresses, install
-	 * it in the IPv4 and/or IPv6 reverse-map lists.
-	 *
-	 * TODO: For speed we want to use something other than a flat list
-	 * here; even sorting the list in terms of lowest address would help a
-	 * bit, but anything we might want to do gets messy and memory
-	 * intensive.
-	 */
-	if (alist->nr_addrs > 0)
-		hlist_add_head_rcu(&server->addr_link, &net->fs_addresses);
-
-	write_sequnlock(&net->fs_addr_lock);
+	afs_get_cell(cell, afs_cell_trace_get_server);
 
 exists:
-	afs_get_server(server, afs_server_trace_get_install);
-	write_sequnlock(&net->fs_lock);
+	afs_use_server(server, true, afs_server_trace_use_install);
 	return server;
 }
 
 /*
- * Allocate a new server record and mark it active.
+ * Allocate a new server record and mark it as active but uncreated.
  */
-static struct afs_server *afs_alloc_server(struct afs_cell *cell,
-					   const uuid_t *uuid,
-					   struct afs_addr_list *alist)
+static struct afs_server *afs_alloc_server(struct afs_cell *cell, const uuid_t *uuid)
 {
-	struct afs_endpoint_state *estate;
 	struct afs_server *server;
 	struct afs_net *net = cell->net;
 
@@ -203,65 +119,50 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
 
 	server = kzalloc(sizeof(struct afs_server), GFP_KERNEL);
 	if (!server)
-		goto enomem;
-
-	estate = kzalloc(sizeof(struct afs_endpoint_state), GFP_KERNEL);
-	if (!estate)
-		goto enomem_server;
+		return NULL;
 
 	refcount_set(&server->ref, 1);
-	atomic_set(&server->active, 1);
+	atomic_set(&server->active, 0);
+	__set_bit(AFS_SERVER_FL_UNCREATED, &server->flags);
 	server->debug_id = atomic_inc_return(&afs_server_debug_id);
-	server->addr_version = alist->version;
 	server->uuid = *uuid;
 	rwlock_init(&server->fs_lock);
+	INIT_WORK(&server->destroyer, &afs_server_destroyer);
+	timer_setup(&server->timer, afs_server_timer, 0);
 	INIT_LIST_HEAD(&server->volumes);
 	init_waitqueue_head(&server->probe_wq);
+	mutex_init(&server->cm_token_lock);
 	INIT_LIST_HEAD(&server->probe_link);
+	INIT_HLIST_NODE(&server->proc_link);
 	spin_lock_init(&server->probe_lock);
 	server->cell = cell;
 	server->rtt = UINT_MAX;
 	server->service_id = FS_SERVICE;
-
 	server->probe_counter = 1;
 	server->probed_at = jiffies - LONG_MAX / 2;
-	refcount_set(&estate->ref, 1);
-	estate->addresses = alist;
-	estate->server_id = server->debug_id;
-	estate->probe_seq = 1;
-	rcu_assign_pointer(server->endpoint_state, estate);
 
 	afs_inc_servers_outstanding(net);
-	trace_afs_server(server->debug_id, 1, 1, afs_server_trace_alloc);
-	trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref),
-			 afs_estate_trace_alloc_server);
 	_leave(" = %p", server);
 	return server;
-
-enomem_server:
-	kfree(server);
-enomem:
-	_leave(" = NULL [nomem]");
-	return NULL;
 }
 
 /*
  * Look up an address record for a server
  */
-static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell,
-						 struct key *key, const uuid_t *uuid)
+static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_server *server,
+						 struct key *key)
 {
 	struct afs_vl_cursor vc;
 	struct afs_addr_list *alist = NULL;
 	int ret;
 
 	ret = -ERESTARTSYS;
-	if (afs_begin_vlserver_operation(&vc, cell, key)) {
+	if (afs_begin_vlserver_operation(&vc, server->cell, key)) {
 		while (afs_select_vlserver(&vc)) {
 			if (test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags))
-				alist = afs_yfsvl_get_endpoints(&vc, uuid);
+				alist = afs_yfsvl_get_endpoints(&vc, &server->uuid);
 			else
-				alist = afs_vl_get_addrs_u(&vc, uuid);
+				alist = afs_vl_get_addrs_u(&vc, &server->uuid);
 		}
 
 		ret = afs_end_vlserver_operation(&vc);
@@ -271,72 +172,122 @@ static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell,
 }
 
 /*
- * Get or create a fileserver record.
+ * Get or create a fileserver record and return it with an active-use count on
+ * it.
  */
 struct afs_server *afs_lookup_server(struct afs_cell *cell, struct key *key,
 				     const uuid_t *uuid, u32 addr_version)
 {
-	struct afs_addr_list *alist;
-	struct afs_server *server, *candidate;
+	struct afs_addr_list *alist = NULL;
+	struct afs_server *server, *candidate = NULL;
+	bool creating = false;
+	int ret;
 
 	_enter("%p,%pU", cell->net, uuid);
 
-	server = afs_find_server_by_uuid(cell->net, uuid);
+	down_read(&cell->fs_lock);
+	server = afs_find_server_by_uuid(cell, uuid);
+	/* Won't see servers marked uncreated. */
+	up_read(&cell->fs_lock);
+
 	if (server) {
+		timer_delete_sync(&server->timer);
+		if (test_bit(AFS_SERVER_FL_CREATING, &server->flags))
+			goto wait_for_creation;
 		if (server->addr_version != addr_version)
 			set_bit(AFS_SERVER_FL_NEEDS_UPDATE, &server->flags);
 		return server;
 	}
 
-	alist = afs_vl_lookup_addrs(cell, key, uuid);
-	if (IS_ERR(alist))
-		return ERR_CAST(alist);
-
-	candidate = afs_alloc_server(cell, uuid, alist);
+	candidate = afs_alloc_server(cell, uuid);
 	if (!candidate) {
 		afs_put_addrlist(alist, afs_alist_trace_put_server_oom);
 		return ERR_PTR(-ENOMEM);
 	}
 
-	server = afs_install_server(cell, candidate);
-	if (server != candidate) {
-		afs_put_addrlist(alist, afs_alist_trace_put_server_dup);
+	down_write(&cell->fs_lock);
+	server = afs_install_server(cell, &candidate);
+	if (test_bit(AFS_SERVER_FL_CREATING, &server->flags)) {
+		/* We need to wait for creation to complete. */
+		up_write(&cell->fs_lock);
+		goto wait_for_creation;
+	}
+	if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags)) {
+		set_bit(AFS_SERVER_FL_CREATING, &server->flags);
+		clear_bit(AFS_SERVER_FL_UNCREATED, &server->flags);
+		creating = true;
+	}
+	up_write(&cell->fs_lock);
+	timer_delete_sync(&server->timer);
+
+	/* If we get to create the server, we look up the addresses and then
+	 * immediately dispatch an asynchronous probe to each interface on the
+	 * fileserver.  This will make sure the repeat-probing service is
+	 * started.
+	 */
+	if (creating) {
+		alist = afs_vl_lookup_addrs(server, key);
+		if (IS_ERR(alist)) {
+			ret = PTR_ERR(alist);
+			goto create_failed;
+		}
+
+		ret = afs_fs_probe_fileserver(cell->net, server, alist, key);
+		if (ret)
+			goto create_failed;
+
+		clear_and_wake_up_bit(AFS_SERVER_FL_CREATING, &server->flags);
+	}
+
+out:
+	afs_put_addrlist(alist, afs_alist_trace_put_server_create);
+	if (candidate) {
+		kfree(rcu_access_pointer(server->endpoint_state));
 		kfree(candidate);
-	} else {
-		/* Immediately dispatch an asynchronous probe to each interface
-		 * on the fileserver.  This will make sure the repeat-probing
-		 * service is started.
-		 */
-		afs_fs_probe_fileserver(cell->net, server, alist, key);
+		afs_dec_servers_outstanding(cell->net);
+	}
+	return server ?: ERR_PTR(ret);
+
+wait_for_creation:
+	afs_see_server(server, afs_server_trace_wait_create);
+	wait_on_bit(&server->flags, AFS_SERVER_FL_CREATING, TASK_UNINTERRUPTIBLE);
+	if (test_bit_acquire(AFS_SERVER_FL_UNCREATED, &server->flags)) {
+		/* Barrier: read flag before error */
+		ret = READ_ONCE(server->create_error);
+		afs_put_server(cell->net, server, afs_server_trace_unuse_create_fail);
+		server = NULL;
+		goto out;
 	}
 
-	return server;
-}
+	ret = 0;
+	goto out;
 
-/*
- * Set the server timer to fire after a given delay, assuming it's not already
- * set for an earlier time.
- */
-static void afs_set_server_timer(struct afs_net *net, time64_t delay)
-{
-	if (net->live) {
-		afs_inc_servers_outstanding(net);
-		if (timer_reduce(&net->fs_timer, jiffies + delay * HZ))
-			afs_dec_servers_outstanding(net);
+create_failed:
+	down_write(&cell->fs_lock);
+
+	WRITE_ONCE(server->create_error, ret);
+	smp_wmb(); /* Barrier: set error before flag. */
+	set_bit(AFS_SERVER_FL_UNCREATED, &server->flags);
+
+	clear_and_wake_up_bit(AFS_SERVER_FL_CREATING, &server->flags);
+
+	if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags)) {
+		clear_bit(AFS_SERVER_FL_UNCREATED, &server->flags);
+		creating = true;
 	}
+	afs_unuse_server(cell->net, server, afs_server_trace_unuse_create_fail);
+	server = NULL;
+
+	up_write(&cell->fs_lock);
+	goto out;
 }
 
 /*
- * Server management timer.  We have an increment on fs_outstanding that we
- * need to pass along to the work item.
+ * Set/reduce a server's timer.
  */
-void afs_servers_timer(struct timer_list *timer)
+static void afs_set_server_timer(struct afs_server *server, unsigned int delay_secs)
 {
-	struct afs_net *net = container_of(timer, struct afs_net, fs_timer);
-
-	_enter("");
-	if (!queue_work(afs_wq, &net->fs_manager))
-		afs_dec_servers_outstanding(net);
+	mod_timer(&server->timer, jiffies + delay_secs * HZ);
 }
 
 /*
@@ -355,32 +306,20 @@ struct afs_server *afs_get_server(struct afs_server *server,
 }
 
 /*
- * Try to get a reference on a server object.
+ * Get an active count on a server object and maybe remove from the inactive
+ * list.
  */
-static struct afs_server *afs_maybe_use_server(struct afs_server *server,
-					       enum afs_server_trace reason)
-{
-	unsigned int a;
-	int r;
-
-	if (!__refcount_inc_not_zero(&server->ref, &r))
-		return NULL;
-
-	a = atomic_inc_return(&server->active);
-	trace_afs_server(server->debug_id, r + 1, a, reason);
-	return server;
-}
-
-/*
- * Get an active count on a server object.
- */
-struct afs_server *afs_use_server(struct afs_server *server, enum afs_server_trace reason)
+struct afs_server *afs_use_server(struct afs_server *server, bool activate,
+				  enum afs_server_trace reason)
 {
 	unsigned int a;
 	int r;
 
 	__refcount_inc(&server->ref, &r);
 	a = atomic_inc_return(&server->active);
+	if (a == 1 && activate &&
+	    !test_bit(AFS_SERVER_FL_EXPIRED, &server->flags))
+		timer_delete(&server->timer);
 
 	trace_afs_server(server->debug_id, r + 1, a, reason);
 	return server;
@@ -392,13 +331,14 @@ struct afs_server *afs_use_server(struct afs_server *server, enum afs_server_tra
 void afs_put_server(struct afs_net *net, struct afs_server *server,
 		    enum afs_server_trace reason)
 {
-	unsigned int a, debug_id = server->debug_id;
+	unsigned int a, debug_id;
 	bool zero;
 	int r;
 
 	if (!server)
 		return;
 
+	debug_id = server->debug_id;
 	a = atomic_read(&server->active);
 	zero = __refcount_dec_and_test(&server->ref, &r);
 	trace_afs_server(debug_id, r - 1, a, reason);
@@ -413,13 +353,16 @@ void afs_put_server(struct afs_net *net, struct afs_server *server,
 void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server,
 			     enum afs_server_trace reason)
 {
-	if (server) {
-		unsigned int active = atomic_dec_return(&server->active);
+	if (!server)
+		return;
 
-		if (active == 0)
-			afs_set_server_timer(net, afs_server_gc_delay);
-		afs_put_server(net, server, reason);
+	if (atomic_dec_and_test(&server->active)) {
+		if (test_bit(AFS_SERVER_FL_EXPIRED, &server->flags) ||
+		    READ_ONCE(server->cell->state) >= AFS_CELL_REMOVING)
+			schedule_work(&server->destroyer);
 	}
+
+	afs_put_server(net, server, reason);
 }
 
 /*
@@ -428,10 +371,22 @@ void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server,
 void afs_unuse_server(struct afs_net *net, struct afs_server *server,
 		      enum afs_server_trace reason)
 {
-	if (server) {
-		server->unuse_time = ktime_get_real_seconds();
-		afs_unuse_server_notime(net, server, reason);
+	if (!server)
+		return;
+
+	if (atomic_dec_and_test(&server->active)) {
+		if (!test_bit(AFS_SERVER_FL_EXPIRED, &server->flags) &&
+		    READ_ONCE(server->cell->state) < AFS_CELL_REMOVING) {
+			time64_t unuse_time = ktime_get_real_seconds();
+
+			server->unuse_time = unuse_time;
+			afs_set_server_timer(server, afs_server_gc_delay);
+		} else {
+			schedule_work(&server->destroyer);
+		}
 	}
+
+	afs_put_server(net, server, reason);
 }
 
 static void afs_server_rcu(struct rcu_head *rcu)
@@ -442,6 +397,8 @@ static void afs_server_rcu(struct rcu_head *rcu)
 			 atomic_read(&server->active), afs_server_trace_free);
 	afs_put_endpoint_state(rcu_access_pointer(server->endpoint_state),
 			       afs_estate_trace_put_server);
+	afs_put_cell(server->cell, afs_cell_trace_put_server);
+	kfree(server->cm_rxgk_appdata.data);
 	kfree(server);
 }
 
@@ -460,159 +417,119 @@ static void afs_give_up_callbacks(struct afs_net *net, struct afs_server *server
 }
 
 /*
- * destroy a dead server
+ * Check to see if the server record has expired.
  */
-static void afs_destroy_server(struct afs_net *net, struct afs_server *server)
+static bool afs_has_server_expired(const struct afs_server *server)
 {
-	if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags))
-		afs_give_up_callbacks(net, server);
+	time64_t expires_at;
 
-	afs_put_server(net, server, afs_server_trace_destroy);
+	if (atomic_read(&server->active))
+		return false;
+
+	if (server->cell->net->live ||
+	    server->cell->state >= AFS_CELL_REMOVING) {
+		trace_afs_server(server->debug_id, refcount_read(&server->ref),
+				 0, afs_server_trace_purging);
+		return true;
+	}
+
+	expires_at = server->unuse_time;
+	if (!test_bit(AFS_SERVER_FL_VL_FAIL, &server->flags) &&
+	    !test_bit(AFS_SERVER_FL_NOT_FOUND, &server->flags))
+		expires_at += afs_server_gc_delay;
+
+	return ktime_get_real_seconds() > expires_at;
 }
 
 /*
- * Garbage collect any expired servers.
+ * Remove a server record from its parent cell's database.
  */
-static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list)
+static bool afs_remove_server_from_cell(struct afs_server *server)
 {
-	struct afs_server *server, *next, *prev;
-	int active;
-
-	while ((server = gc_list)) {
-		gc_list = server->gc_next;
-
-		write_seqlock(&net->fs_lock);
-
-		active = atomic_read(&server->active);
-		if (active == 0) {
-			trace_afs_server(server->debug_id, refcount_read(&server->ref),
-					 active, afs_server_trace_gc);
-			next = rcu_dereference_protected(
-				server->uuid_next, lockdep_is_held(&net->fs_lock.lock));
-			prev = server->uuid_prev;
-			if (!prev) {
-				/* The one at the front is in the tree */
-				if (!next) {
-					rb_erase(&server->uuid_rb, &net->fs_servers);
-				} else {
-					rb_replace_node_rcu(&server->uuid_rb,
-							    &next->uuid_rb,
-							    &net->fs_servers);
-					next->uuid_prev = NULL;
-				}
-			} else {
-				/* This server is not at the front */
-				rcu_assign_pointer(prev->uuid_next, next);
-				if (next)
-					next->uuid_prev = prev;
-			}
-
-			list_del(&server->probe_link);
-			hlist_del_rcu(&server->proc_link);
-			if (!hlist_unhashed(&server->addr_link))
-				hlist_del_rcu(&server->addr_link);
-		}
-		write_sequnlock(&net->fs_lock);
+	struct afs_cell *cell = server->cell;
+
+	down_write(&cell->fs_lock);
 
-		if (active == 0)
-			afs_destroy_server(net, server);
+	if (!afs_has_server_expired(server)) {
+		up_write(&cell->fs_lock);
+		return false;
 	}
+
+	set_bit(AFS_SERVER_FL_EXPIRED, &server->flags);
+	_debug("expire %pU %u", &server->uuid, atomic_read(&server->active));
+	afs_see_server(server, afs_server_trace_see_expired);
+	rb_erase(&server->uuid_rb, &cell->fs_servers);
+	up_write(&cell->fs_lock);
+	return true;
 }
 
-/*
- * Manage the records of servers known to be within a network namespace.  This
- * includes garbage collecting unused servers.
- *
- * Note also that we were given an increment on net->servers_outstanding by
- * whoever queued us that we need to deal with before returning.
- */
-void afs_manage_servers(struct work_struct *work)
+static void afs_server_destroyer(struct work_struct *work)
 {
-	struct afs_net *net = container_of(work, struct afs_net, fs_manager);
-	struct afs_server *gc_list = NULL;
-	struct rb_node *cursor;
-	time64_t now = ktime_get_real_seconds(), next_manage = TIME64_MAX;
-	bool purging = !net->live;
-
-	_enter("");
+	struct afs_endpoint_state *estate;
+	struct afs_server *server = container_of(work, struct afs_server, destroyer);
+	struct afs_net *net = server->cell->net;
 
-	/* Trawl the server list looking for servers that have expired from
-	 * lack of use.
-	 */
-	read_seqlock_excl(&net->fs_lock);
+	afs_see_server(server, afs_server_trace_see_destroyer);
 
-	for (cursor = rb_first(&net->fs_servers); cursor; cursor = rb_next(cursor)) {
-		struct afs_server *server =
-			rb_entry(cursor, struct afs_server, uuid_rb);
-		int active = atomic_read(&server->active);
+	if (test_bit(AFS_SERVER_FL_EXPIRED, &server->flags))
+		return;
 
-		_debug("manage %pU %u", &server->uuid, active);
+	if (!afs_remove_server_from_cell(server))
+		return;
 
-		if (purging) {
-			trace_afs_server(server->debug_id, refcount_read(&server->ref),
-					 active, afs_server_trace_purging);
-			if (active != 0)
-				pr_notice("Can't purge s=%08x\n", server->debug_id);
-		}
+	timer_shutdown_sync(&server->timer);
+	cancel_work(&server->destroyer);
 
-		if (active == 0) {
-			time64_t expire_at = server->unuse_time;
-
-			if (!test_bit(AFS_SERVER_FL_VL_FAIL, &server->flags) &&
-			    !test_bit(AFS_SERVER_FL_NOT_FOUND, &server->flags))
-				expire_at += afs_server_gc_delay;
-			if (purging || expire_at <= now) {
-				server->gc_next = gc_list;
-				gc_list = server;
-			} else if (expire_at < next_manage) {
-				next_manage = expire_at;
-			}
-		}
-	}
+	if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags))
+		afs_give_up_callbacks(net, server);
 
-	read_sequnlock_excl(&net->fs_lock);
+	/* Unbind the rxrpc_peer records from the server. */
+	estate = rcu_access_pointer(server->endpoint_state);
+	if (estate)
+		afs_set_peer_appdata(server, estate->addresses, NULL);
 
-	/* Update the timer on the way out.  We have to pass an increment on
-	 * servers_outstanding in the namespace that we are in to the timer or
-	 * the work scheduler.
-	 */
-	if (!purging && next_manage < TIME64_MAX) {
-		now = ktime_get_real_seconds();
+	write_seqlock(&net->fs_lock);
+	list_del_init(&server->probe_link);
+	if (!hlist_unhashed(&server->proc_link))
+		hlist_del_rcu(&server->proc_link);
+	write_sequnlock(&net->fs_lock);
 
-		if (next_manage - now <= 0) {
-			if (queue_work(afs_wq, &net->fs_manager))
-				afs_inc_servers_outstanding(net);
-		} else {
-			afs_set_server_timer(net, next_manage - now);
-		}
-	}
+	afs_put_server(net, server, afs_server_trace_destroy);
+}
 
-	afs_gc_servers(net, gc_list);
+static void afs_server_timer(struct timer_list *timer)
+{
+	struct afs_server *server = container_of(timer, struct afs_server, timer);
 
-	afs_dec_servers_outstanding(net);
-	_leave(" [%d]", atomic_read(&net->servers_outstanding));
+	afs_see_server(server, afs_server_trace_see_timer);
+	if (!test_bit(AFS_SERVER_FL_EXPIRED, &server->flags))
+		schedule_work(&server->destroyer);
 }
 
-static void afs_queue_server_manager(struct afs_net *net)
+/*
+ * Wake up all the servers in a cell so that they can purge themselves.
+ */
+void afs_purge_servers(struct afs_cell *cell)
 {
-	afs_inc_servers_outstanding(net);
-	if (!queue_work(afs_wq, &net->fs_manager))
-		afs_dec_servers_outstanding(net);
+	struct afs_server *server;
+	struct rb_node *rb;
+
+	down_read(&cell->fs_lock);
+	for (rb = rb_first(&cell->fs_servers); rb; rb = rb_next(rb)) {
+		server = rb_entry(rb, struct afs_server, uuid_rb);
+		afs_see_server(server, afs_server_trace_see_purge);
+		schedule_work(&server->destroyer);
+	}
+	up_read(&cell->fs_lock);
 }
 
 /*
- * Purge list of servers.
+ * Wait for outstanding servers.
  */
-void afs_purge_servers(struct afs_net *net)
+void afs_wait_for_servers(struct afs_net *net)
 {
 	_enter("");
 
-	if (del_timer_sync(&net->fs_timer))
-		afs_dec_servers_outstanding(net);
-
-	afs_queue_server_manager(net);
-
-	_debug("wait");
 	atomic_dec(&net->servers_outstanding);
 	wait_var_event(&net->servers_outstanding,
 		       !atomic_read(&net->servers_outstanding));
@@ -636,7 +553,7 @@ static noinline bool afs_update_server_record(struct afs_operation *op,
 			 atomic_read(&server->active),
 			 afs_server_trace_update);
 
-	alist = afs_vl_lookup_addrs(op->volume->cell, op->key, &server->uuid);
+	alist = afs_vl_lookup_addrs(server, op->key);
 	if (IS_ERR(alist)) {
 		rcu_read_lock();
 		estate = rcu_dereference(server->endpoint_state);
diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c
index 7e7e567a7f8a..20d5474837df 100644
--- a/fs/afs/server_list.c
+++ b/fs/afs/server_list.c
@@ -16,7 +16,7 @@ void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist)
 	if (slist && refcount_dec_and_test(&slist->usage)) {
 		for (i = 0; i < slist->nr_servers; i++)
 			afs_unuse_server(net, slist->servers[i].server,
-					 afs_server_trace_put_slist);
+					 afs_server_trace_unuse_slist);
 		kfree_rcu(slist, rcu);
 	}
 }
@@ -97,8 +97,8 @@ struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume,
 				break;
 		if (j < slist->nr_servers) {
 			if (slist->servers[j].server == server) {
-				afs_put_server(volume->cell->net, server,
-					       afs_server_trace_put_slist_isort);
+				afs_unuse_server_notime(volume->cell->net, server,
+							afs_server_trace_unuse_slist_isort);
 				continue;
 			}
 
diff --git a/fs/afs/super.c b/fs/afs/super.c
index f3ba1c3e72f5..d672b7ab57ae 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -194,8 +194,6 @@ static int afs_show_options(struct seq_file *m, struct dentry *root)
 
 	if (as->dyn_root)
 		seq_puts(m, ",dyn");
-	if (test_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(d_inode(root))->flags))
-		seq_puts(m, ",autocell");
 	switch (as->flock_mode) {
 	case afs_flock_mode_unset:	break;
 	case afs_flock_mode_local:	p = "local";	break;
@@ -292,13 +290,14 @@ static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param)
 	/* lookup the cell record */
 	if (cellname) {
 		cell = afs_lookup_cell(ctx->net, cellname, cellnamesz,
-				       NULL, false);
+				       NULL, AFS_LOOKUP_CELL_DIRECT_MOUNT,
+				       afs_cell_trace_use_lookup_mount);
 		if (IS_ERR(cell)) {
 			pr_err("kAFS: unable to lookup cell '%*.*s'\n",
 			       cellnamesz, cellnamesz, cellname ?: "");
 			return PTR_ERR(cell);
 		}
-		afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_parse);
+		afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_parse);
 		afs_see_cell(cell, afs_cell_trace_see_source);
 		ctx->cell = cell;
 	}
@@ -395,7 +394,7 @@ static int afs_validate_fc(struct fs_context *fc)
 				ctx->key = NULL;
 				cell = afs_use_cell(ctx->cell->alias_of,
 						    afs_cell_trace_use_fc_alias);
-				afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc);
+				afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_fc);
 				ctx->cell = cell;
 				goto reget_key;
 			}
@@ -468,7 +467,7 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
 
 	/* allocate the root inode and dentry */
 	if (as->dyn_root) {
-		inode = afs_iget_pseudo_dir(sb, true);
+		inode = afs_dynroot_iget_root(sb);
 	} else {
 		sprintf(sb->s_id, "%llu", as->volume->vid);
 		afs_activate_volume(as->volume);
@@ -478,21 +477,15 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 
-	if (ctx->autocell || as->dyn_root)
-		set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags);
-
 	ret = -ENOMEM;
 	sb->s_root = d_make_root(inode);
 	if (!sb->s_root)
 		goto error;
 
 	if (as->dyn_root) {
-		sb->s_d_op = &afs_dynroot_dentry_operations;
-		ret = afs_dynroot_populate(sb);
-		if (ret < 0)
-			goto error;
+		set_default_d_op(sb, &afs_dynroot_dentry_operations);
 	} else {
-		sb->s_d_op = &afs_fs_dentry_operations;
+		set_default_d_op(sb, &afs_fs_dentry_operations);
 		rcu_assign_pointer(as->volume->sb, sb);
 	}
 
@@ -527,9 +520,8 @@ static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc)
 static void afs_destroy_sbi(struct afs_super_info *as)
 {
 	if (as) {
-		struct afs_net *net = afs_net(as->net_ns);
 		afs_put_volume(as->volume, afs_volume_trace_put_destroy_sbi);
-		afs_unuse_cell(net, as->cell, afs_cell_trace_unuse_sbi);
+		afs_unuse_cell(as->cell, afs_cell_trace_unuse_sbi);
 		put_net(as->net_ns);
 		kfree(as);
 	}
@@ -539,9 +531,6 @@ static void afs_kill_super(struct super_block *sb)
 {
 	struct afs_super_info *as = AFS_FS_S(sb);
 
-	if (as->dyn_root)
-		afs_dynroot_depopulate(sb);
-
 	/* Clear the callback interests (which will do ilookup5) before
 	 * deactivating the superblock.
 	 */
@@ -615,7 +604,7 @@ static void afs_free_fc(struct fs_context *fc)
 
 	afs_destroy_sbi(fc->s_fs_info);
 	afs_put_volume(ctx->volume, afs_volume_trace_put_free_fc);
-	afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc);
+	afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_fc);
 	key_put(ctx->key);
 	kfree(ctx);
 }
@@ -663,7 +652,7 @@ static void afs_i_init_once(void *_vnode)
 
 	memset(vnode, 0, sizeof(*vnode));
 	inode_init_once(&vnode->netfs.inode);
-	mutex_init(&vnode->io_lock);
+	INIT_LIST_HEAD(&vnode->io_lock_waiters);
 	init_rwsem(&vnode->validate_lock);
 	spin_lock_init(&vnode->wb_lock);
 	spin_lock_init(&vnode->lock);
@@ -696,6 +685,8 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
 	vnode->volume		= NULL;
 	vnode->lock_key		= NULL;
 	vnode->permit_cache	= NULL;
+	vnode->directory	= NULL;
+	vnode->directory_size	= 0;
 
 	vnode->flags		= 1 << AFS_VNODE_UNSET;
 	vnode->lock_state	= AFS_VNODE_LOCK_NONE;
diff --git a/fs/afs/validation.c b/fs/afs/validation.c
index bef8af12ebe2..0ba8336c9025 100644
--- a/fs/afs/validation.c
+++ b/fs/afs/validation.c
@@ -120,22 +120,31 @@
 bool afs_check_validity(const struct afs_vnode *vnode)
 {
 	const struct afs_volume *volume = vnode->volume;
+	enum afs_vnode_invalid_trace trace = afs_vnode_valid_trace;
+	time64_t cb_expires_at = atomic64_read(&vnode->cb_expires_at);
 	time64_t deadline = ktime_get_real_seconds() + 10;
 
 	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
 		return true;
 
-	if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
-	    atomic64_read(&vnode->cb_expires_at)  <= deadline ||
-	    volume->cb_expires_at <= deadline ||
-	    vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot) ||
-	    vnode->cb_scrub	  != atomic_read(&volume->cb_scrub) ||
-	    test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
-		_debug("inval");
-		return false;
-	}
-
-	return true;
+	if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break))
+		trace = afs_vnode_invalid_trace_cb_v_break;
+	else if (cb_expires_at == AFS_NO_CB_PROMISE)
+		trace = afs_vnode_invalid_trace_no_cb_promise;
+	else if (cb_expires_at <= deadline)
+		trace = afs_vnode_invalid_trace_expired;
+	else if (volume->cb_expires_at <= deadline)
+		trace = afs_vnode_invalid_trace_vol_expired;
+	else if (vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot))
+		trace = afs_vnode_invalid_trace_cb_ro_snapshot;
+	else if (vnode->cb_scrub != atomic_read(&volume->cb_scrub))
+		trace = afs_vnode_invalid_trace_cb_scrub;
+	else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
+		trace = afs_vnode_invalid_trace_zap_data;
+	else
+		return true;
+	trace_afs_vnode_invalid(vnode, trace);
+	return false;
 }
 
 /*
diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c
index 9f36e14f1c2d..fc9676abd252 100644
--- a/fs/afs/vl_alias.c
+++ b/fs/afs/vl_alias.c
@@ -205,11 +205,11 @@ static int afs_query_for_alias(struct afs_cell *cell, struct key *key)
 			goto is_alias;
 
 		if (mutex_lock_interruptible(&cell->net->proc_cells_lock) < 0) {
-			afs_unuse_cell(cell->net, p, afs_cell_trace_unuse_check_alias);
+			afs_unuse_cell(p, afs_cell_trace_unuse_check_alias);
 			return -ERESTARTSYS;
 		}
 
-		afs_unuse_cell(cell->net, p, afs_cell_trace_unuse_check_alias);
+		afs_unuse_cell(p, afs_cell_trace_unuse_check_alias);
 	}
 
 	mutex_unlock(&cell->net->proc_cells_lock);
@@ -253,6 +253,7 @@ static char *afs_vl_get_cell_name(struct afs_cell *cell, struct key *key)
 static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key *key)
 {
 	struct afs_cell *master;
+	size_t name_len;
 	char *cell_name;
 
 	cell_name = afs_vl_get_cell_name(cell, key);
@@ -264,8 +265,13 @@ static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key *key)
 		return 0;
 	}
 
-	master = afs_lookup_cell(cell->net, cell_name, strlen(cell_name),
-				 NULL, false);
+	name_len = strlen(cell_name);
+	if (!name_len || name_len > AFS_MAXCELLNAME)
+		master = ERR_PTR(-EOPNOTSUPP);
+	else
+		master = afs_lookup_cell(cell->net, cell_name, name_len, NULL,
+					 AFS_LOOKUP_CELL_ALIAS_CHECK,
+					 afs_cell_trace_use_lookup_canonical);
 	kfree(cell_name);
 	if (IS_ERR(master))
 		return PTR_ERR(master);
diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
index d8f79f6ada3d..6ad9688d8f4b 100644
--- a/fs/afs/vl_rotate.c
+++ b/fs/afs/vl_rotate.c
@@ -48,7 +48,7 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
 	    cell->dns_expiry <= ktime_get_real_seconds()) {
 		dns_lookup_count = smp_load_acquire(&cell->dns_lookup_count);
 		set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags);
-		afs_queue_cell(cell, afs_cell_trace_get_queue_dns);
+		afs_queue_cell(cell, afs_cell_trace_queue_dns);
 
 		if (cell->dns_source == DNS_RECORD_UNAVAILABLE) {
 			if (wait_var_event_interruptible(
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index cac75f89b64a..3a23c0b08eb6 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -370,6 +370,7 @@ static const struct afs_call_type afs_RXVLGetCapabilities = {
 	.name		= "VL.GetCapabilities",
 	.op		= afs_VL_GetCapabilities,
 	.deliver	= afs_deliver_vl_get_capabilities,
+	.immediate_cancel = afs_vlserver_probe_result,
 	.done		= afs_vlserver_probe_result,
 	.destructor	= afs_destroy_vl_get_capabilities,
 };
@@ -697,7 +698,7 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call)
 			return ret;
 
 		namesz = ntohl(call->tmp);
-		if (namesz > AFS_MAXCELLNAME)
+		if (namesz > YFS_VL_MAXCELLNAME)
 			return afs_protocol_error(call, afs_eproto_cellname_len);
 		paddedsz = (namesz + 3) & ~3;
 		call->count = namesz;
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index af3a3f57c1b3..0efff3d25133 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -10,6 +10,7 @@
 #include "internal.h"
 
 static unsigned __read_mostly afs_volume_record_life = 60 * 60;
+static atomic_t afs_volume_debug_id;
 
 static void afs_destroy_volume(struct work_struct *work);
 
@@ -59,7 +60,7 @@ static void afs_remove_volume_from_cell(struct afs_volume *volume)
 	struct afs_cell *cell = volume->cell;
 
 	if (!hlist_unhashed(&volume->proc_link)) {
-		trace_afs_volume(volume->vid, refcount_read(&cell->ref),
+		trace_afs_volume(volume->debug_id, volume->vid, refcount_read(&volume->ref),
 				 afs_volume_trace_remove);
 		write_seqlock(&cell->volume_lock);
 		hlist_del_rcu(&volume->proc_link);
@@ -84,6 +85,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
 	if (!volume)
 		goto error_0;
 
+	volume->debug_id	= atomic_inc_return(&afs_volume_debug_id);
 	volume->vid		= vldb->vid[params->type];
 	volume->update_at	= ktime_get_real_seconds() + afs_volume_record_life;
 	volume->cell		= afs_get_cell(params->cell, afs_cell_trace_get_vol);
@@ -115,7 +117,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
 
 	*_slist = slist;
 	rcu_assign_pointer(volume->servers, slist);
-	trace_afs_volume(volume->vid, 1, afs_volume_trace_alloc);
+	trace_afs_volume(volume->debug_id, volume->vid, 1, afs_volume_trace_alloc);
 	return volume;
 
 error_1:
@@ -247,7 +249,7 @@ static void afs_destroy_volume(struct work_struct *work)
 	afs_remove_volume_from_cell(volume);
 	afs_put_serverlist(volume->cell->net, slist);
 	afs_put_cell(volume->cell, afs_cell_trace_put_vol);
-	trace_afs_volume(volume->vid, refcount_read(&volume->ref),
+	trace_afs_volume(volume->debug_id, volume->vid, refcount_read(&volume->ref),
 			 afs_volume_trace_free);
 	kfree_rcu(volume, rcu);
 
@@ -262,7 +264,7 @@ bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason)
 	int r;
 
 	if (__refcount_inc_not_zero(&volume->ref, &r)) {
-		trace_afs_volume(volume->vid, r + 1, reason);
+		trace_afs_volume(volume->debug_id, volume->vid, r + 1, reason);
 		return true;
 	}
 	return false;
@@ -278,7 +280,7 @@ struct afs_volume *afs_get_volume(struct afs_volume *volume,
 		int r;
 
 		__refcount_inc(&volume->ref, &r);
-		trace_afs_volume(volume->vid, r + 1, reason);
+		trace_afs_volume(volume->debug_id, volume->vid, r + 1, reason);
 	}
 	return volume;
 }
@@ -290,12 +292,13 @@ struct afs_volume *afs_get_volume(struct afs_volume *volume,
 void afs_put_volume(struct afs_volume *volume, enum afs_volume_trace reason)
 {
 	if (volume) {
+		unsigned int debug_id = volume->debug_id;
 		afs_volid_t vid = volume->vid;
 		bool zero;
 		int r;
 
 		zero = __refcount_dec_and_test(&volume->ref, &r);
-		trace_afs_volume(vid, r - 1, reason);
+		trace_afs_volume(debug_id, vid, r - 1, reason);
 		if (zero)
 			schedule_work(&volume->destructor);
 	}
diff --git a/fs/afs/write.c b/fs/afs/write.c
index e959640694c2..93ad86ff3345 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -89,10 +89,12 @@ static const struct afs_operation_ops afs_store_data_operation = {
  */
 void afs_prepare_write(struct netfs_io_subrequest *subreq)
 {
+	struct netfs_io_stream *stream = &subreq->rreq->io_streams[subreq->stream_nr];
+
 	//if (test_bit(NETFS_SREQ_RETRYING, &subreq->flags))
 	//	subreq->max_len = 512 * 1024;
 	//else
-	subreq->max_len = 256 * 1024 * 1024;
+	stream->sreq_max_len = 256 * 1024 * 1024;
 }
 
 /*
@@ -118,17 +120,17 @@ static void afs_issue_write_worker(struct work_struct *work)
 
 #if 0 // Error injection
 	if (subreq->debug_index == 3)
-		return netfs_write_subrequest_terminated(subreq, -ENOANO, false);
+		return netfs_write_subrequest_terminated(subreq, -ENOANO);
 
-	if (!test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) {
+	if (!subreq->retry_count) {
 		set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
-		return netfs_write_subrequest_terminated(subreq, -EAGAIN, false);
+		return netfs_write_subrequest_terminated(subreq, -EAGAIN);
 	}
 #endif
 
 	op = afs_alloc_operation(wreq->netfs_priv, vnode->volume);
 	if (IS_ERR(op))
-		return netfs_write_subrequest_terminated(subreq, -EAGAIN, false);
+		return netfs_write_subrequest_terminated(subreq, -EAGAIN);
 
 	afs_op_set_vnode(op, 0, vnode);
 	op->file[0].dv_delta	= 1;
@@ -147,6 +149,9 @@ static void afs_issue_write_worker(struct work_struct *work)
 	afs_wait_for_operation(op);
 	ret = afs_put_operation(op);
 	switch (ret) {
+	case 0:
+		__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
+		break;
 	case -EACCES:
 	case -EPERM:
 	case -ENOKEY:
@@ -161,13 +166,13 @@ static void afs_issue_write_worker(struct work_struct *work)
 		break;
 	}
 
-	netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len, false);
+	netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len);
 }
 
 void afs_issue_write(struct netfs_io_subrequest *subreq)
 {
 	subreq->work.func = afs_issue_write_worker;
-	if (!queue_work(system_unbound_wq, &subreq->work))
+	if (!queue_work(system_dfl_wq, &subreq->work))
 		WARN_ON_ONCE(1);
 }
 
@@ -177,8 +182,8 @@ void afs_issue_write(struct netfs_io_subrequest *subreq)
  */
 void afs_begin_writeback(struct netfs_io_request *wreq)
 {
-	afs_get_writeback_key(wreq);
-	wreq->io_streams[0].avail = true;
+	if (S_ISREG(wreq->inode->i_mode))
+		afs_get_writeback_key(wreq);
 }
 
 /*
@@ -191,6 +196,19 @@ void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *st
 		list_first_entry(&stream->subrequests,
 				 struct netfs_io_subrequest, rreq_link);
 
+	switch (wreq->origin) {
+	case NETFS_READAHEAD:
+	case NETFS_READPAGE:
+	case NETFS_READ_GAPS:
+	case NETFS_READ_SINGLE:
+	case NETFS_READ_FOR_WRITE:
+	case NETFS_UNBUFFERED_READ:
+	case NETFS_DIO_READ:
+		return;
+	default:
+		break;
+	}
+
 	switch (subreq->error) {
 	case -EACCES:
 	case -EPERM:
diff --git a/fs/afs/xdr_fs.h b/fs/afs/xdr_fs.h
index 8ca868164507..cc5f143d21a3 100644
--- a/fs/afs/xdr_fs.h
+++ b/fs/afs/xdr_fs.h
@@ -88,7 +88,7 @@ union afs_xdr_dir_block {
 
 	struct {
 		struct afs_xdr_dir_hdr	hdr;
-		u8			alloc_ctrs[AFS_DIR_MAX_BLOCKS];
+		u8			alloc_ctrs[AFS_DIR_BLOCKS_WITH_CTR];
 		__be16			hashtable[AFS_DIR_HASHTBL_SIZE];
 	} meta;
 
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index f521e66d3bf6..febf13a49f0b 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -352,18 +352,19 @@ static int yfs_deliver_status_and_volsync(struct afs_call *call)
 static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
 {
 	struct afs_operation *op = call->op;
+	struct netfs_io_subrequest *subreq = op->fetch.subreq;
 	struct afs_vnode_param *vp = &op->file[0];
-	struct afs_read *req = op->fetch.req;
 	const __be32 *bp;
+	size_t count_before;
 	int ret;
 
 	_enter("{%u,%zu, %zu/%llu}",
 	       call->unmarshall, call->iov_len, iov_iter_count(call->iter),
-	       req->actual_len);
+	       call->remaining);
 
 	switch (call->unmarshall) {
 	case 0:
-		req->actual_len = 0;
+		call->remaining = 0;
 		afs_extract_to_tmp64(call);
 		call->unmarshall++;
 		fallthrough;
@@ -378,38 +379,39 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
 		if (ret < 0)
 			return ret;
 
-		req->actual_len = be64_to_cpu(call->tmp64);
-		_debug("DATA length: %llu", req->actual_len);
+		call->remaining = be64_to_cpu(call->tmp64);
+		_debug("DATA length: %llu", call->remaining);
 
-		if (req->actual_len == 0)
+		if (call->remaining == 0)
 			goto no_more_data;
 
-		call->iter = req->iter;
-		call->iov_len = min(req->actual_len, req->len);
+		call->iter = &subreq->io_iter;
+		call->iov_len = min(call->remaining, subreq->len - subreq->transferred);
 		call->unmarshall++;
 		fallthrough;
 
 		/* extract the returned data */
 	case 2:
-		_debug("extract data %zu/%llu",
-		       iov_iter_count(call->iter), req->actual_len);
+		count_before = call->iov_len;
+		_debug("extract data %zu/%llu", count_before, call->remaining);
 
 		ret = afs_extract_data(call, true);
+		subreq->transferred += count_before - call->iov_len;
 		if (ret < 0)
 			return ret;
 
 		call->iter = &call->def_iter;
-		if (req->actual_len <= req->len)
+		if (call->remaining)
 			goto no_more_data;
 
 		/* Discard any excess data the server gave us */
-		afs_extract_discard(call, req->actual_len - req->len);
+		afs_extract_discard(call, call->remaining);
 		call->unmarshall = 3;
 		fallthrough;
 
 	case 3:
 		_debug("extract discard %zu/%llu",
-		       iov_iter_count(call->iter), req->actual_len - req->len);
+		       iov_iter_count(call->iter), call->remaining);
 
 		ret = afs_extract_data(call, true);
 		if (ret < 0)
@@ -434,8 +436,8 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
 		xdr_decode_YFSCallBack(&bp, call, &vp->scb);
 		xdr_decode_YFSVolSync(&bp, &op->volsync);
 
-		req->data_version = vp->scb.status.data_version;
-		req->file_size = vp->scb.status.size;
+		if (subreq->start + subreq->transferred >= vp->scb.status.size)
+			__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
 
 		call->unmarshall++;
 		fallthrough;
@@ -454,7 +456,9 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
 static const struct afs_call_type yfs_RXYFSFetchData64 = {
 	.name		= "YFS.FetchData64",
 	.op		= yfs_FS_FetchData64,
+	.async_rx	= afs_fetch_data_async_rx,
 	.deliver	= yfs_deliver_fs_fetch_data64,
+	.immediate_cancel = afs_fetch_data_immediate_cancel,
 	.destructor	= afs_flat_call_destructor,
 };
 
@@ -463,14 +467,15 @@ static const struct afs_call_type yfs_RXYFSFetchData64 = {
  */
 void yfs_fs_fetch_data(struct afs_operation *op)
 {
+	struct netfs_io_subrequest *subreq = op->fetch.subreq;
 	struct afs_vnode_param *vp = &op->file[0];
-	struct afs_read *req = op->fetch.req;
 	struct afs_call *call;
 	__be32 *bp;
 
-	_enter(",%x,{%llx:%llu},%llx,%llx",
+	_enter(",%x,{%llx:%llu},%llx,%zx",
 	       key_serial(op->key), vp->fid.vid, vp->fid.vnode,
-	       req->pos, req->len);
+	       subreq->start + subreq->transferred,
+	       subreq->len   - subreq->transferred);
 
 	call = afs_alloc_flat_call(op->net, &yfs_RXYFSFetchData64,
 				   sizeof(__be32) * 2 +
@@ -482,15 +487,16 @@ void yfs_fs_fetch_data(struct afs_operation *op)
 	if (!call)
 		return afs_op_nomem(op);
 
-	req->call_debug_id = call->debug_id;
+	if (op->flags & AFS_OPERATION_ASYNC)
+		call->async = true;
 
 	/* marshall the parameters */
 	bp = call->request;
 	bp = xdr_encode_u32(bp, YFSFETCHDATA64);
 	bp = xdr_encode_u32(bp, 0); /* RPC flags */
 	bp = xdr_encode_YFSFid(bp, &vp->fid);
-	bp = xdr_encode_u64(bp, req->pos);
-	bp = xdr_encode_u64(bp, req->len);
+	bp = xdr_encode_u64(bp, subreq->start + subreq->transferred);
+	bp = xdr_encode_u64(bp, subreq->len   - subreq->transferred);
 	yfs_check_req(call, bp);
 
 	call->fid = vp->fid;
@@ -661,8 +667,9 @@ static int yfs_deliver_fs_remove_file2(struct afs_call *call)
 static void yfs_done_fs_remove_file2(struct afs_call *call)
 {
 	if (call->error == -ECONNABORTED &&
-	    call->abort_code == RX_INVALID_OPERATION) {
-		set_bit(AFS_SERVER_FL_NO_RM2, &call->server->flags);
+	    (call->abort_code == RX_INVALID_OPERATION ||
+	     call->abort_code == RXGEN_OPCODE)) {
+		set_bit(AFS_SERVER_FL_NO_RM2, &call->op->server->flags);
 		call->op->flags |= AFS_OPERATION_DOWNGRADE;
 	}
 }
@@ -1035,6 +1042,9 @@ void yfs_fs_rename(struct afs_operation *op)
 
 	_enter("");
 
+	if (!test_bit(AFS_SERVER_FL_NO_RENAME2, &op->server->flags))
+		return yfs_fs_rename_replace(op);
+
 	call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename,
 				   sizeof(__be32) +
 				   sizeof(struct yfs_xdr_RPCFlags) +
@@ -1064,6 +1074,252 @@ void yfs_fs_rename(struct afs_operation *op)
 }
 
 /*
+ * Deliver reply data to a YFS.Rename_NoReplace operation.  This does not
+ * return the status of a displaced target inode as there cannot be one.
+ */
+static int yfs_deliver_fs_rename_1(struct afs_call *call)
+{
+	struct afs_operation *op = call->op;
+	struct afs_vnode_param *orig_dvp = &op->file[0];
+	struct afs_vnode_param *new_dvp = &op->file[1];
+	struct afs_vnode_param *old_vp = &op->more_files[0];
+	const __be32 *bp;
+	int ret;
+
+	_enter("{%u}", call->unmarshall);
+
+	ret = afs_transfer_reply(call);
+	if (ret < 0)
+		return ret;
+
+	bp = call->buffer;
+	/* If the two dirs are the same, we have two copies of the same status
+	 * report, so we just decode it twice.
+	 */
+	xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb);
+	xdr_decode_YFSFid(&bp, &old_vp->fid);
+	xdr_decode_YFSFetchStatus(&bp, call, &old_vp->scb);
+	xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb);
+	xdr_decode_YFSVolSync(&bp, &op->volsync);
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * Deliver reply data to a YFS.Rename_Replace or a YFS.Rename_Exchange
+ * operation.  These return the status of the displaced target inode if there
+ * was one.
+ */
+static int yfs_deliver_fs_rename_2(struct afs_call *call)
+{
+	struct afs_operation *op = call->op;
+	struct afs_vnode_param *orig_dvp = &op->file[0];
+	struct afs_vnode_param *new_dvp = &op->file[1];
+	struct afs_vnode_param *old_vp = &op->more_files[0];
+	struct afs_vnode_param *new_vp = &op->more_files[1];
+	const __be32 *bp;
+	int ret;
+
+	_enter("{%u}", call->unmarshall);
+
+	ret = afs_transfer_reply(call);
+	if (ret < 0)
+		return ret;
+
+	bp = call->buffer;
+	/* If the two dirs are the same, we have two copies of the same status
+	 * report, so we just decode it twice.
+	 */
+	xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb);
+	xdr_decode_YFSFid(&bp, &old_vp->fid);
+	xdr_decode_YFSFetchStatus(&bp, call, &old_vp->scb);
+	xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb);
+	xdr_decode_YFSFid(&bp, &new_vp->fid);
+	xdr_decode_YFSFetchStatus(&bp, call, &new_vp->scb);
+	xdr_decode_YFSVolSync(&bp, &op->volsync);
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+static void yfs_done_fs_rename_replace(struct afs_call *call)
+{
+	if (call->error == -ECONNABORTED &&
+	    (call->abort_code == RX_INVALID_OPERATION ||
+	     call->abort_code == RXGEN_OPCODE)) {
+		set_bit(AFS_SERVER_FL_NO_RENAME2, &call->op->server->flags);
+		call->op->flags |= AFS_OPERATION_DOWNGRADE;
+	}
+}
+
+/*
+ * YFS.Rename_Replace operation type
+ */
+static const struct afs_call_type yfs_RXYFSRename_Replace = {
+	.name		= "FS.Rename_Replace",
+	.op		= yfs_FS_Rename_Replace,
+	.deliver	= yfs_deliver_fs_rename_2,
+	.done		= yfs_done_fs_rename_replace,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * YFS.Rename_NoReplace operation type
+ */
+static const struct afs_call_type yfs_RXYFSRename_NoReplace = {
+	.name		= "FS.Rename_NoReplace",
+	.op		= yfs_FS_Rename_NoReplace,
+	.deliver	= yfs_deliver_fs_rename_1,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * YFS.Rename_Exchange operation type
+ */
+static const struct afs_call_type yfs_RXYFSRename_Exchange = {
+	.name		= "FS.Rename_Exchange",
+	.op		= yfs_FS_Rename_Exchange,
+	.deliver	= yfs_deliver_fs_rename_2,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * Rename a file or directory, replacing the target if it exists.  The status
+ * of a displaced target is returned.
+ */
+void yfs_fs_rename_replace(struct afs_operation *op)
+{
+	struct afs_vnode_param *orig_dvp = &op->file[0];
+	struct afs_vnode_param *new_dvp = &op->file[1];
+	const struct qstr *orig_name = &op->dentry->d_name;
+	const struct qstr *new_name = &op->dentry_2->d_name;
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_Replace,
+				   sizeof(__be32) +
+				   sizeof(struct yfs_xdr_RPCFlags) +
+				   sizeof(struct yfs_xdr_YFSFid) +
+				   xdr_strlen(orig_name->len) +
+				   sizeof(struct yfs_xdr_YFSFid) +
+				   xdr_strlen(new_name->len),
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSFid) +
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSFid) +
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSVolSync));
+	if (!call)
+		return afs_op_nomem(op);
+
+	/* Marshall the parameters. */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSRENAME_REPLACE);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &orig_dvp->fid);
+	bp = xdr_encode_name(bp, orig_name);
+	bp = xdr_encode_YFSFid(bp, &new_dvp->fid);
+	bp = xdr_encode_name(bp, new_name);
+	yfs_check_req(call, bp);
+
+	call->fid = orig_dvp->fid;
+	trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name);
+	afs_make_op_call(op, call, GFP_NOFS);
+}
+
+/*
+ * Rename a file or directory, failing if the target dirent exists.
+ */
+void yfs_fs_rename_noreplace(struct afs_operation *op)
+{
+	struct afs_vnode_param *orig_dvp = &op->file[0];
+	struct afs_vnode_param *new_dvp = &op->file[1];
+	const struct qstr *orig_name = &op->dentry->d_name;
+	const struct qstr *new_name = &op->dentry_2->d_name;
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_NoReplace,
+				   sizeof(__be32) +
+				   sizeof(struct yfs_xdr_RPCFlags) +
+				   sizeof(struct yfs_xdr_YFSFid) +
+				   xdr_strlen(orig_name->len) +
+				   sizeof(struct yfs_xdr_YFSFid) +
+				   xdr_strlen(new_name->len),
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSFid) +
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSVolSync));
+	if (!call)
+		return afs_op_nomem(op);
+
+	/* Marshall the parameters. */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSRENAME_NOREPLACE);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &orig_dvp->fid);
+	bp = xdr_encode_name(bp, orig_name);
+	bp = xdr_encode_YFSFid(bp, &new_dvp->fid);
+	bp = xdr_encode_name(bp, new_name);
+	yfs_check_req(call, bp);
+
+	call->fid = orig_dvp->fid;
+	trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name);
+	afs_make_op_call(op, call, GFP_NOFS);
+}
+
+/*
+ * Exchange a pair of files or directories.
+ */
+void yfs_fs_rename_exchange(struct afs_operation *op)
+{
+	struct afs_vnode_param *orig_dvp = &op->file[0];
+	struct afs_vnode_param *new_dvp = &op->file[1];
+	const struct qstr *orig_name = &op->dentry->d_name;
+	const struct qstr *new_name = &op->dentry_2->d_name;
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_Exchange,
+				   sizeof(__be32) +
+				   sizeof(struct yfs_xdr_RPCFlags) +
+				   sizeof(struct yfs_xdr_YFSFid) +
+				   xdr_strlen(orig_name->len) +
+				   sizeof(struct yfs_xdr_YFSFid) +
+				   xdr_strlen(new_name->len),
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSFid) +
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSFid) +
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSVolSync));
+	if (!call)
+		return afs_op_nomem(op);
+
+	/* Marshall the parameters. */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSRENAME_EXCHANGE);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &orig_dvp->fid);
+	bp = xdr_encode_name(bp, orig_name);
+	bp = xdr_encode_YFSFid(bp, &new_dvp->fid);
+	bp = xdr_encode_name(bp, new_name);
+	yfs_check_req(call, bp);
+
+	call->fid = orig_dvp->fid;
+	trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name);
+	afs_make_op_call(op, call, GFP_NOFS);
+}
+
+/*
  * YFS.StoreData64 operation type.
  */
 static const struct afs_call_type yfs_RXYFSStoreData64 = {
diff --git a/fs/aio.c b/fs/aio.c
index 57c9f7c077e6..0a23a8c0717f 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -100,7 +100,7 @@ struct kioctx {
 
 	unsigned long		user_id;
 
-	struct __percpu kioctx_cpu *cpu;
+	struct kioctx_cpu __percpu *cpu;
 
 	/*
 	 * For percpu reqs_available, number of slots we move to/from global
@@ -224,7 +224,7 @@ static unsigned long aio_nr;		/* current system wide number of aio requests */
 static unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
 /*----end sysctl variables---*/
 #ifdef CONFIG_SYSCTL
-static struct ctl_table aio_sysctls[] = {
+static const struct ctl_table aio_sysctls[] = {
 	{
 		.procname	= "aio-nr",
 		.data		= &aio_nr,
@@ -392,15 +392,15 @@ static const struct vm_operations_struct aio_ring_vm_ops = {
 #endif
 };
 
-static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
+static int aio_ring_mmap_prepare(struct vm_area_desc *desc)
 {
-	vm_flags_set(vma, VM_DONTEXPAND);
-	vma->vm_ops = &aio_ring_vm_ops;
+	desc->vm_flags |= VM_DONTEXPAND;
+	desc->vm_ops = &aio_ring_vm_ops;
 	return 0;
 }
 
 static const struct file_operations aio_ring_fops = {
-	.mmap = aio_ring_mmap,
+	.mmap_prepare = aio_ring_mmap_prepare,
 };
 
 #if IS_ENABLED(CONFIG_MIGRATION)
@@ -410,17 +410,7 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst,
 	struct kioctx *ctx;
 	unsigned long flags;
 	pgoff_t idx;
-	int rc;
-
-	/*
-	 * We cannot support the _NO_COPY case here, because copy needs to
-	 * happen under the ctx->completion_lock. That does not work with the
-	 * migration workflow of MIGRATE_SYNC_NO_COPY.
-	 */
-	if (mode == MIGRATE_SYNC_NO_COPY)
-		return -EINVAL;
-
-	rc = 0;
+	int rc = 0;
 
 	/* mapping->i_private_lock here protects against the kioctx teardown.  */
 	spin_lock(&mapping->i_private_lock);
@@ -455,7 +445,7 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst,
 	folio_get(dst);
 
 	rc = folio_migrate_mapping(mapping, dst, src, 1);
-	if (rc != MIGRATEPAGE_SUCCESS) {
+	if (rc) {
 		folio_put(dst);
 		goto out_unlock;
 	}
@@ -465,7 +455,8 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst,
 	 * events from being lost.
 	 */
 	spin_lock_irqsave(&ctx->completion_lock, flags);
-	folio_migrate_copy(dst, src);
+	folio_copy(dst, src);
+	folio_migrate_flags(dst, src);
 	BUG_ON(ctx->ring_folios[idx] != src);
 	ctx->ring_folios[idx] = dst;
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
@@ -645,7 +636,7 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
 
 	/* Synchronize against RCU protected table->table[] dereferences */
 	INIT_RCU_WORK(&ctx->free_rwork, free_ioctx);
-	queue_rcu_work(system_wq, &ctx->free_rwork);
+	queue_rcu_work(system_percpu_wq, &ctx->free_rwork);
 }
 
 /*
@@ -1344,7 +1335,7 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr,
 	if (until == 0 || ret < 0 || ret >= min_nr)
 		return ret;
 
-	hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer_setup_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	if (until != KTIME_MAX) {
 		hrtimer_set_expires_range_ns(&t.timer, until, current->timer_slack_ns);
 		hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL);
@@ -1516,10 +1507,11 @@ static void aio_complete_rw(struct kiocb *kiocb, long res)
 	iocb_put(iocb);
 }
 
-static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
+static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int rw_type)
 {
 	int ret;
 
+	req->ki_write_stream = 0;
 	req->ki_complete = aio_complete_rw;
 	req->private = NULL;
 	req->ki_pos = iocb->aio_offset;
@@ -1542,7 +1534,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
 	} else
 		req->ki_ioprio = get_current_ioprio();
 
-	ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
+	ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags, rw_type);
 	if (unlikely(ret))
 		return ret;
 
@@ -1594,7 +1586,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb,
 	struct file *file;
 	int ret;
 
-	ret = aio_prep_rw(req, iocb);
+	ret = aio_prep_rw(req, iocb, READ);
 	if (ret)
 		return ret;
 	file = req->ki_filp;
@@ -1621,7 +1613,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
 	struct file *file;
 	int ret;
 
-	ret = aio_prep_rw(req, iocb);
+	ret = aio_prep_rw(req, iocb, WRITE);
 	if (ret)
 		return ret;
 	file = req->ki_filp;
@@ -1648,10 +1640,10 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
 static void aio_fsync_work(struct work_struct *work)
 {
 	struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work);
-	const struct cred *old_cred = override_creds(iocb->fsync.creds);
 
-	iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync);
-	revert_creds(old_cred);
+	scoped_with_creds(iocb->fsync.creds)
+		iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync);
+
 	put_cred(iocb->fsync.creds);
 	iocb_put(iocb);
 }
@@ -2200,7 +2192,6 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 		return -EINVAL;
 
 	spin_lock_irq(&ctx->ctx_lock);
-	/* TODO: use a hash or array, this sucks. */
 	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
 		if (kiocb->ki_res.obj == obj) {
 			ret = kiocb->ki_cancel(&kiocb->rw);
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 42bd1cb7c9cd..b8381c7fb636 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -24,10 +24,51 @@
 
 #include <linux/uaccess.h>
 
+#include "internal.h"
+
 static struct vfsmount *anon_inode_mnt __ro_after_init;
 static struct inode *anon_inode_inode __ro_after_init;
 
 /*
+ * User space expects anonymous inodes to have no file type in st_mode.
+ *
+ * In particular, 'lsof' has this legacy logic:
+ *
+ *	type = s->st_mode & S_IFMT;
+ *	switch (type) {
+ *	  ...
+ *	case 0:
+ *		if (!strcmp(p, "anon_inode"))
+ *			Lf->ntype = Ntype = N_ANON_INODE;
+ *
+ * to detect our old anon_inode logic.
+ *
+ * Rather than mess with our internal sane inode data, just fix it
+ * up here in getattr() by masking off the format bits.
+ */
+int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
+		       struct kstat *stat, u32 request_mask,
+		       unsigned int query_flags)
+{
+	struct inode *inode = d_inode(path->dentry);
+
+	generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+	stat->mode &= ~S_IFMT;
+	return 0;
+}
+
+int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+		       struct iattr *attr)
+{
+	return -EOPNOTSUPP;
+}
+
+static const struct inode_operations anon_inode_operations = {
+	.getattr = anon_inode_getattr,
+	.setattr = anon_inode_setattr,
+};
+
+/*
  * anon_inodefs_dname() is called from d_path().
  */
 static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
@@ -45,6 +86,8 @@ static int anon_inodefs_init_fs_context(struct fs_context *fc)
 	struct pseudo_fs_context *ctx = init_pseudo(fc, ANON_INODE_FS_MAGIC);
 	if (!ctx)
 		return -ENOMEM;
+	fc->s_iflags |= SB_I_NOEXEC;
+	fc->s_iflags |= SB_I_NODEV;
 	ctx->dops = &anon_inodefs_dentry_operations;
 	return 0;
 }
@@ -55,25 +98,38 @@ static struct file_system_type anon_inode_fs_type = {
 	.kill_sb	= kill_anon_super,
 };
 
-static struct inode *anon_inode_make_secure_inode(
-	const char *name,
-	const struct inode *context_inode)
+/**
+ * anon_inode_make_secure_inode - allocate an anonymous inode with security context
+ * @sb:		[in]	Superblock to allocate from
+ * @name:	[in]	Name of the class of the new file (e.g., "secretmem")
+ * @context_inode:
+ *		[in]	Optional parent inode for security inheritance
+ *
+ * The function ensures proper security initialization through the LSM hook
+ * security_inode_init_security_anon().
+ *
+ * Return:	Pointer to new inode on success, ERR_PTR on failure.
+ */
+struct inode *anon_inode_make_secure_inode(struct super_block *sb, const char *name,
+					   const struct inode *context_inode)
 {
 	struct inode *inode;
-	const struct qstr qname = QSTR_INIT(name, strlen(name));
 	int error;
 
-	inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
+	inode = alloc_anon_inode(sb);
 	if (IS_ERR(inode))
 		return inode;
 	inode->i_flags &= ~S_PRIVATE;
-	error =	security_inode_init_security_anon(inode, &qname, context_inode);
+	inode->i_op = &anon_inode_operations;
+	error =	security_inode_init_security_anon(inode, &QSTR(name),
+						  context_inode);
 	if (error) {
 		iput(inode);
 		return ERR_PTR(error);
 	}
 	return inode;
 }
+EXPORT_SYMBOL_FOR_MODULES(anon_inode_make_secure_inode, "kvm");
 
 static struct file *__anon_inode_getfile(const char *name,
 					 const struct file_operations *fops,
@@ -88,7 +144,8 @@ static struct file *__anon_inode_getfile(const char *name,
 		return ERR_PTR(-ENOENT);
 
 	if (make_inode) {
-		inode =	anon_inode_make_secure_inode(name, context_inode);
+		inode =	anon_inode_make_secure_inode(anon_inode_mnt->mnt_sb,
+						     name, context_inode);
 		if (IS_ERR(inode)) {
 			file = ERR_CAST(inode);
 			goto err;
@@ -223,27 +280,8 @@ static int __anon_inode_getfd(const char *name,
 			      const struct inode *context_inode,
 			      bool make_inode)
 {
-	int error, fd;
-	struct file *file;
-
-	error = get_unused_fd_flags(flags);
-	if (error < 0)
-		return error;
-	fd = error;
-
-	file = __anon_inode_getfile(name, fops, priv, flags, context_inode,
-				    make_inode);
-	if (IS_ERR(file)) {
-		error = PTR_ERR(file);
-		goto err_put_unused_fd;
-	}
-	fd_install(fd, file);
-
-	return fd;
-
-err_put_unused_fd:
-	put_unused_fd(fd);
-	return error;
+	return FD_ADD(flags, __anon_inode_getfile(name, fops, priv, flags,
+						  context_inode, make_inode));
 }
 
 /**
@@ -313,6 +351,7 @@ static int __init anon_inode_init(void)
 	anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
 	if (IS_ERR(anon_inode_inode))
 		panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode));
+	anon_inode_inode->i_op = &anon_inode_operations;
 
 	return 0;
 }
diff --git a/fs/attr.c b/fs/attr.c
index 960a310581eb..b9ec6b47bab2 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -17,8 +17,6 @@
 #include <linux/filelock.h>
 #include <linux/security.h>
 
-#include "internal.h"
-
 /**
  * setattr_should_drop_sgid - determine whether the setgid bit needs to be
  *                            removed
@@ -232,7 +230,7 @@ EXPORT_SYMBOL(setattr_prepare);
  * @inode:	the inode to be truncated
  * @offset:	the new size to assign to the inode
  *
- * inode_newsize_ok must be called with i_mutex held.
+ * inode_newsize_ok must be called with i_rwsem held exclusively.
  *
  * inode_newsize_ok will check filesystem limits and ulimits to check that the
  * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
@@ -274,12 +272,45 @@ out_big:
 EXPORT_SYMBOL(inode_newsize_ok);
 
 /**
+ * setattr_copy_mgtime - update timestamps for mgtime inodes
+ * @inode: inode timestamps to be updated
+ * @attr: attrs for the update
+ *
+ * With multigrain timestamps, take more care to prevent races when
+ * updating the ctime. Always update the ctime to the very latest using
+ * the standard mechanism, and use that to populate the atime and mtime
+ * appropriately (unless those are being set to specific values).
+ */
+static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr)
+{
+	unsigned int ia_valid = attr->ia_valid;
+	struct timespec64 now;
+
+	if (ia_valid & ATTR_CTIME_SET)
+		now = inode_set_ctime_deleg(inode, attr->ia_ctime);
+	else if (ia_valid & ATTR_CTIME)
+		now = inode_set_ctime_current(inode);
+	else
+		now = current_time(inode);
+
+	if (ia_valid & ATTR_ATIME_SET)
+		inode_set_atime_to_ts(inode, attr->ia_atime);
+	else if (ia_valid & ATTR_ATIME)
+		inode_set_atime_to_ts(inode, now);
+
+	if (ia_valid & ATTR_MTIME_SET)
+		inode_set_mtime_to_ts(inode, attr->ia_mtime);
+	else if (ia_valid & ATTR_MTIME)
+		inode_set_mtime_to_ts(inode, now);
+}
+
+/**
  * setattr_copy - copy simple metadata updates into the generic inode
  * @idmap:	idmap of the mount the inode was found from
  * @inode:	the inode to be updated
  * @attr:	the new attributes
  *
- * setattr_copy must be called with i_mutex held.
+ * setattr_copy must be called with i_rwsem held exclusively.
  *
  * setattr_copy updates the inode's metadata with that specified
  * in attr on idmapped mounts. Necessary permission checks to determine
@@ -305,12 +336,6 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode,
 
 	i_uid_update(idmap, attr, inode);
 	i_gid_update(idmap, attr, inode);
-	if (ia_valid & ATTR_ATIME)
-		inode_set_atime_to_ts(inode, attr->ia_atime);
-	if (ia_valid & ATTR_MTIME)
-		inode_set_mtime_to_ts(inode, attr->ia_mtime);
-	if (ia_valid & ATTR_CTIME)
-		inode_set_ctime_to_ts(inode, attr->ia_ctime);
 	if (ia_valid & ATTR_MODE) {
 		umode_t mode = attr->ia_mode;
 		if (!in_group_or_capable(idmap, inode,
@@ -318,6 +343,19 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode,
 			mode &= ~S_ISGID;
 		inode->i_mode = mode;
 	}
+
+	if (is_mgtime(inode))
+		return setattr_copy_mgtime(inode, attr);
+
+	if (ia_valid & ATTR_ATIME)
+		inode_set_atime_to_ts(inode, attr->ia_atime);
+	if (ia_valid & ATTR_MTIME)
+		inode_set_mtime_to_ts(inode, attr->ia_mtime);
+
+	if (ia_valid & ATTR_CTIME_SET)
+		inode_set_ctime_deleg(inode, attr->ia_ctime);
+	else if (ia_valid & ATTR_CTIME)
+		inode_set_ctime_to_ts(inode, attr->ia_ctime);
 }
 EXPORT_SYMBOL(setattr_copy);
 
@@ -356,13 +394,13 @@ EXPORT_SYMBOL(may_setattr);
  * @attr:	new attributes
  * @delegated_inode: returns inode, if the inode is delegated
  *
- * The caller must hold the i_mutex on the affected object.
+ * The caller must hold the i_rwsem exclusively on the affected object.
  *
  * If notify_change discovers a delegation in need of breaking,
  * it will return -EWOULDBLOCK and return a reference to the inode in
  * delegated_inode.  The caller should then break the delegation and
  * retry.  Because breaking a delegation may take a long time, the
- * caller should drop the i_mutex before doing so.
+ * caller should drop the i_rwsem before doing so.
  *
  * Alternatively, a caller may pass NULL for delegated_inode.  This may
  * be appropriate for callers that expect the underlying filesystem not
@@ -377,7 +415,7 @@ EXPORT_SYMBOL(may_setattr);
  * performed on the raw inode simply pass @nop_mnt_idmap.
  */
 int notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
-		  struct iattr *attr, struct inode **delegated_inode)
+		  struct iattr *attr, struct delegated_inode *delegated_inode)
 {
 	struct inode *inode = dentry->d_inode;
 	umode_t mode = inode->i_mode;
@@ -409,22 +447,25 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
 		if (S_ISLNK(inode->i_mode))
 			return -EOPNOTSUPP;
 
-		/* Flag setting protected by i_mutex */
+		/* Flag setting protected by i_rwsem */
 		if (is_sxid(attr->ia_mode))
 			inode->i_flags &= ~S_NOSEC;
 	}
 
 	now = current_time(inode);
 
-	attr->ia_ctime = now;
-	if (!(ia_valid & ATTR_ATIME_SET))
-		attr->ia_atime = now;
-	else
+	if (ia_valid & ATTR_ATIME_SET)
 		attr->ia_atime = timestamp_truncate(attr->ia_atime, inode);
-	if (!(ia_valid & ATTR_MTIME_SET))
-		attr->ia_mtime = now;
 	else
+		attr->ia_atime = now;
+	if (ia_valid & ATTR_CTIME_SET)
+		attr->ia_ctime = timestamp_truncate(attr->ia_ctime, inode);
+	else
+		attr->ia_ctime = now;
+	if (ia_valid & ATTR_MTIME_SET)
 		attr->ia_mtime = timestamp_truncate(attr->ia_mtime, inode);
+	else
+		attr->ia_mtime = now;
 
 	if (ia_valid & ATTR_KILL_PRIV) {
 		error = security_inode_need_killpriv(dentry);
@@ -489,9 +530,17 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
 	error = security_inode_setattr(idmap, dentry, attr);
 	if (error)
 		return error;
-	error = try_break_deleg(inode, delegated_inode);
-	if (error)
-		return error;
+
+	/*
+	 * If ATTR_DELEG is set, then these attributes are being set on
+	 * behalf of the holder of a write delegation. We want to avoid
+	 * breaking the delegation in this case.
+	 */
+	if (!(ia_valid & ATTR_DELEG)) {
+		error = try_break_deleg(inode, delegated_inode);
+		if (error)
+			return error;
+	}
 
 	if (inode->i_op->setattr)
 		error = inode->i_op->setattr(idmap, dentry, attr);
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 8c1d587b3eef..4fd555528c5d 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -16,6 +16,7 @@
 #include <linux/wait.h>
 #include <linux/sched.h>
 #include <linux/sched/signal.h>
+#include <uapi/linux/mount.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/uaccess.h>
@@ -27,6 +28,9 @@
 #include <linux/magic.h>
 #include <linux/fs_context.h>
 #include <linux/fs_parser.h>
+#include "../mount.h"
+#include <linux/ns_common.h>
+
 
 /* This is the range of ioctl() numbers we claim as ours */
 #define AUTOFS_IOC_FIRST     AUTOFS_IOC_READY
@@ -62,6 +66,7 @@ struct autofs_info {
 	struct list_head expiring;
 
 	struct autofs_sb_info *sbi;
+	unsigned long exp_timeout;
 	unsigned long last_used;
 	int count;
 
@@ -81,6 +86,9 @@ struct autofs_info {
 					*/
 #define AUTOFS_INF_PENDING	(1<<2) /* dentry pending mount */
 
+#define AUTOFS_INF_EXPIRE_SET	(1<<3) /* per-dentry expire timeout set for
+					  this mount point.
+					*/
 struct autofs_wait_queue {
 	wait_queue_head_t queue;
 	struct autofs_wait_queue *next;
@@ -110,6 +118,7 @@ struct autofs_sb_info {
 	int pipefd;
 	struct file *pipe;
 	struct pid *oz_pgrp;
+	u64 mnt_ns_id;
 	int version;
 	int sub_version;
 	int min_proto;
@@ -214,6 +223,8 @@ void autofs_clean_ino(struct autofs_info *);
 
 static inline int autofs_check_pipe(struct file *pipe)
 {
+	if (pipe->f_mode & FMODE_PATH)
+		return -EINVAL;
 	if (!(pipe->f_mode & FMODE_CAN_WRITE))
 		return -EINVAL;
 	if (!S_ISFIFO(file_inode(pipe)->i_mode))
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index 5bf781ea6d67..6743b3b64217 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -110,6 +110,7 @@ static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
  */
 static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
 {
+	unsigned int inr = _IOC_NR(cmd);
 	int err;
 
 	err = check_dev_ioctl_version(cmd, param);
@@ -128,15 +129,19 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
 			goto out;
 		}
 
+		/* Setting the per-dentry expire timeout requires a trailing
+		 * path component, i.e. no '/', so invert the logic of the
+		 * check_name() return for AUTOFS_DEV_IOCTL_TIMEOUT_CMD.
+		 */
 		err = check_name(param->path);
+		if (inr == AUTOFS_DEV_IOCTL_TIMEOUT_CMD)
+			err = err ? 0 : -EINVAL;
 		if (err) {
 			pr_warn("invalid path supplied for cmd(0x%08x)\n",
 				cmd);
 			goto out;
 		}
 	} else {
-		unsigned int inr = _IOC_NR(cmd);
-
 		if (inr == AUTOFS_DEV_IOCTL_OPENMOUNT_CMD ||
 		    inr == AUTOFS_DEV_IOCTL_REQUESTER_CMD ||
 		    inr == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD) {
@@ -226,32 +231,14 @@ static int test_by_type(const struct path *path, void *p)
  */
 static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid)
 {
-	int err, fd;
-
-	fd = get_unused_fd_flags(O_CLOEXEC);
-	if (likely(fd >= 0)) {
-		struct file *filp;
-		struct path path;
-
-		err = find_autofs_mount(name, &path, test_by_dev, &devid);
-		if (err)
-			goto out;
-
-		filp = dentry_open(&path, O_RDONLY, current_cred());
-		path_put(&path);
-		if (IS_ERR(filp)) {
-			err = PTR_ERR(filp);
-			goto out;
-		}
-
-		fd_install(fd, filp);
-	}
+	struct path path __free(path_put) = {};
+	int err;
 
-	return fd;
+	err = find_autofs_mount(name, &path, test_by_dev, &devid);
+	if (err)
+		return err;
 
-out:
-	put_unused_fd(fd);
-	return err;
+	return FD_ADD(O_CLOEXEC, dentry_open(&path, O_RDONLY, current_cred()));
 }
 
 /* Open a file descriptor on an autofs mount point */
@@ -376,6 +363,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
 		swap(sbi->oz_pgrp, new_pid);
 		sbi->pipefd = pipefd;
 		sbi->pipe = pipe;
+		sbi->mnt_ns_id = to_ns_common(current->nsproxy->mnt_ns)->ns_id;
 		sbi->flags &= ~AUTOFS_SBI_CATATONIC;
 	}
 out:
@@ -396,16 +384,97 @@ static int autofs_dev_ioctl_catatonic(struct file *fp,
 	return 0;
 }
 
-/* Set the autofs mount timeout */
+/*
+ * Set the autofs mount expire timeout.
+ *
+ * There are two places an expire timeout can be set, in the autofs
+ * super block info. (this is all that's needed for direct and offset
+ * mounts because there's a distinct mount corresponding to each of
+ * these) and per-dentry within the dentry info. If a per-dentry
+ * timeout is set it will override the expire timeout set in the parent
+ * autofs super block info.
+ *
+ * If setting the autofs super block expire timeout the autofs_dev_ioctl
+ * size field will be equal to the autofs_dev_ioctl structure size. If
+ * setting the per-dentry expire timeout the mount point name is passed
+ * in the autofs_dev_ioctl path field and the size field updated to
+ * reflect this.
+ *
+ * Setting the autofs mount expire timeout sets the timeout in the super
+ * block info struct. Setting the per-dentry timeout does a little more.
+ * If the timeout is equal to -1, the per-dentry timeout (and flag) is
+ * cleared, which reverts to using the super block timeout. If the
+ * timeout is 0, the timeout is stored and the flag is set, which
+ * disables expiration for the mount point. For any other value, the
+ * flag and the timeout are set so the dentry uses this timeout.
+ */
 static int autofs_dev_ioctl_timeout(struct file *fp,
 				    struct autofs_sb_info *sbi,
 				    struct autofs_dev_ioctl *param)
 {
-	unsigned long timeout;
+	unsigned long timeout = param->timeout.timeout;
+
+	/* If setting the expire timeout for an individual indirect
+	 * mount point dentry the mount trailing component path is
+	 * placed in param->path and param->size adjusted to account
+	 * for it, otherwise param->size is set to the structure
+	 * size.
+	 */
+	if (param->size == AUTOFS_DEV_IOCTL_SIZE) {
+		param->timeout.timeout = sbi->exp_timeout / HZ;
+		sbi->exp_timeout = timeout * HZ;
+	} else {
+		struct dentry *base = fp->f_path.dentry;
+		int path_len = param->size - AUTOFS_DEV_IOCTL_SIZE - 1;
+		struct dentry *dentry;
+		struct autofs_info *ino;
+
+		if (!autofs_type_indirect(sbi->type))
+			return -EINVAL;
+
+		dentry = try_lookup_noperm(&QSTR_LEN(param->path, path_len),
+					   base);
+		if (IS_ERR_OR_NULL(dentry))
+			return dentry ? PTR_ERR(dentry) : -ENOENT;
+		ino = autofs_dentry_ino(dentry);
+		if (!ino) {
+			dput(dentry);
+			return -ENOENT;
+		}
+
+		if (ino->exp_timeout && ino->flags & AUTOFS_INF_EXPIRE_SET)
+			param->timeout.timeout = ino->exp_timeout / HZ;
+		else
+			param->timeout.timeout = sbi->exp_timeout / HZ;
+
+		if (timeout == -1) {
+			/* Revert to using the super block timeout */
+			ino->flags &= ~AUTOFS_INF_EXPIRE_SET;
+			ino->exp_timeout = 0;
+		} else {
+			/* Set the dentry expire flag and timeout.
+			 *
+			 * If timeout is 0 it will prevent the expire
+			 * of this particular automount.
+			 */
+			ino->flags |= AUTOFS_INF_EXPIRE_SET;
+			ino->exp_timeout = timeout * HZ;
+		}
+
+		/* An expire timeout greater than the superblock timeout
+		 * could be a problem at shutdown but the super block
+		 * timeout itself can change so all we can really do is
+		 * warn the user.
+		 */
+		if (ino->flags & AUTOFS_INF_EXPIRE_SET &&
+		    ino->exp_timeout > sbi->exp_timeout)
+			pr_warn("per-mount expire timeout is greater than "
+				"the parent autofs mount timeout which could "
+				"prevent shutdown\n");
+
+		dput(dentry);
+	}
 
-	timeout = param->timeout.timeout;
-	param->timeout.timeout = sbi->exp_timeout / HZ;
-	sbi->exp_timeout = timeout * HZ;
 	return 0;
 }
 
diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index 39d8c84c16f4..5c2d459e1e48 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -429,8 +429,6 @@ static struct dentry *autofs_expire_indirect(struct super_block *sb,
 	if (!root)
 		return NULL;
 
-	timeout = sbi->exp_timeout;
-
 	dentry = NULL;
 	while ((dentry = get_next_positive_subdir(dentry, root))) {
 		spin_lock(&sbi->fs_lock);
@@ -441,6 +439,11 @@ static struct dentry *autofs_expire_indirect(struct super_block *sb,
 		}
 		spin_unlock(&sbi->fs_lock);
 
+		if (ino->flags & AUTOFS_INF_EXPIRE_SET)
+			timeout = ino->exp_timeout;
+		else
+			timeout = sbi->exp_timeout;
+
 		expired = should_expire(dentry, mnt, timeout, how);
 		if (!expired)
 			continue;
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
index b5e4dfa04ed0..1d644a35ffa0 100644
--- a/fs/autofs/init.c
+++ b/fs/autofs/init.c
@@ -38,4 +38,5 @@ static void __exit exit_autofs_fs(void)
 
 module_init(init_autofs_fs)
 module_exit(exit_autofs_fs)
+MODULE_DESCRIPTION("Kernel automounter support");
 MODULE_LICENSE("GPL");
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 1f5db6863663..b932b1719dfc 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -19,6 +19,7 @@ struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi)
 		INIT_LIST_HEAD(&ino->expiring);
 		ino->last_used = jiffies;
 		ino->sbi = sbi;
+		ino->exp_timeout = -1;
 		ino->count = 1;
 	}
 	return ino;
@@ -28,6 +29,7 @@ void autofs_clean_ino(struct autofs_info *ino)
 {
 	ino->uid = GLOBAL_ROOT_UID;
 	ino->gid = GLOBAL_ROOT_GID;
+	ino->exp_timeout = -1;
 	ino->last_used = jiffies;
 }
 
@@ -53,7 +55,7 @@ void autofs_kill_sb(struct super_block *sb)
 	}
 
 	pr_debug("shutting down\n");
-	kill_litter_super(sb);
+	kill_anon_super(sb);
 	if (sbi)
 		kfree_rcu(sbi, rcu);
 }
@@ -126,7 +128,7 @@ enum {
 const struct fs_parameter_spec autofs_param_specs[] = {
 	fsparam_flag	("direct",		Opt_direct),
 	fsparam_fd	("fd",			Opt_fd),
-	fsparam_u32	("gid",			Opt_gid),
+	fsparam_gid	("gid",			Opt_gid),
 	fsparam_flag	("ignore",		Opt_ignore),
 	fsparam_flag	("indirect",		Opt_indirect),
 	fsparam_u32	("maxproto",		Opt_maxproto),
@@ -134,7 +136,7 @@ const struct fs_parameter_spec autofs_param_specs[] = {
 	fsparam_flag	("offset",		Opt_offset),
 	fsparam_u32	("pgrp",		Opt_pgrp),
 	fsparam_flag	("strictexpire",	Opt_strictexpire),
-	fsparam_u32	("uid",			Opt_uid),
+	fsparam_uid	("uid",			Opt_uid),
 	{}
 };
 
@@ -172,8 +174,7 @@ static int autofs_parse_fd(struct fs_context *fc, struct autofs_sb_info *sbi,
 	ret = autofs_check_pipe(pipe);
 	if (ret < 0) {
 		errorf(fc, "Invalid/unusable pipe");
-		if (param->type != fs_value_is_file)
-			fput(pipe);
+		fput(pipe);
 		return -EBADF;
 	}
 
@@ -193,8 +194,6 @@ static int autofs_parse_param(struct fs_context *fc, struct fs_parameter *param)
 	struct autofs_fs_context *ctx = fc->fs_private;
 	struct autofs_sb_info *sbi = fc->s_fs_info;
 	struct fs_parse_result result;
-	kuid_t uid;
-	kgid_t gid;
 	int opt;
 
 	opt = fs_parse(fc, autofs_param_specs, param, &result);
@@ -205,16 +204,10 @@ static int autofs_parse_param(struct fs_context *fc, struct fs_parameter *param)
 	case Opt_fd:
 		return autofs_parse_fd(fc, sbi, param, &result);
 	case Opt_uid:
-		uid = make_kuid(current_user_ns(), result.uint_32);
-		if (!uid_valid(uid))
-			return invalfc(fc, "Invalid uid");
-		ctx->uid = uid;
+		ctx->uid = result.uid;
 		break;
 	case Opt_gid:
-		gid = make_kgid(current_user_ns(), result.uint_32);
-		if (!gid_valid(gid))
-			return invalfc(fc, "Invalid gid");
-		ctx->gid = gid;
+		ctx->gid = result.gid;
 		break;
 	case Opt_pgrp:
 		ctx->pgrp = result.uint_32;
@@ -258,6 +251,7 @@ static struct autofs_sb_info *autofs_alloc_sbi(void)
 	sbi->min_proto = AUTOFS_MIN_PROTO_VERSION;
 	sbi->max_proto = AUTOFS_MAX_PROTO_VERSION;
 	sbi->pipefd = -1;
+	sbi->mnt_ns_id = to_ns_common(current->nsproxy->mnt_ns)->ns_id;
 
 	set_autofs_type_indirect(&sbi->type);
 	mutex_init(&sbi->wq_mutex);
@@ -318,7 +312,7 @@ static int autofs_fill_super(struct super_block *s, struct fs_context *fc)
 	s->s_blocksize_bits = 10;
 	s->s_magic = AUTOFS_SUPER_MAGIC;
 	s->s_op = &autofs_sops;
-	s->s_d_op = &autofs_dentry_operations;
+	set_default_d_op(s, &autofs_dentry_operations);
 	s->s_time_gran = 1;
 
 	/*
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 530d18827e35..2c31002b314a 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -15,8 +15,8 @@ static int autofs_dir_symlink(struct mnt_idmap *, struct inode *,
 			      struct dentry *, const char *);
 static int autofs_dir_unlink(struct inode *, struct dentry *);
 static int autofs_dir_rmdir(struct inode *, struct dentry *);
-static int autofs_dir_mkdir(struct mnt_idmap *, struct inode *,
-			    struct dentry *, umode_t);
+static struct dentry *autofs_dir_mkdir(struct mnt_idmap *, struct inode *,
+				       struct dentry *, umode_t);
 static long autofs_root_ioctl(struct file *, unsigned int, unsigned long);
 #ifdef CONFIG_COMPAT
 static long autofs_root_compat_ioctl(struct file *,
@@ -341,6 +341,14 @@ static struct vfsmount *autofs_d_automount(struct path *path)
 	if (autofs_oz_mode(sbi))
 		return NULL;
 
+	/* Refuse to trigger mount if current namespace is not the owner
+	 * and the mount is propagation private.
+	 */
+	if (sbi->mnt_ns_id != to_ns_common(current->nsproxy->mnt_ns)->ns_id) {
+		if (vfsmount_to_propagation_flags(path->mnt) & MS_PRIVATE)
+			return ERR_PTR(-EPERM);
+	}
+
 	/*
 	 * If an expire request is pending everyone must wait.
 	 * If the expire fails we're still mounted so continue
@@ -594,9 +602,8 @@ static int autofs_dir_symlink(struct mnt_idmap *idmap,
 	}
 	inode->i_private = cp;
 	inode->i_size = size;
-	d_add(dentry, inode);
 
-	dget(dentry);
+	d_make_persistent(dentry, inode);
 	p_ino = autofs_dentry_ino(dentry->d_parent);
 	p_ino->count++;
 
@@ -623,12 +630,11 @@ static int autofs_dir_symlink(struct mnt_idmap *idmap,
 static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
-	struct autofs_info *ino = autofs_dentry_ino(dentry);
 	struct autofs_info *p_ino;
 
 	p_ino = autofs_dentry_ino(dentry->d_parent);
 	p_ino->count--;
-	dput(ino->dentry);
+	d_make_discardable(dentry);
 
 	d_inode(dentry)->i_size = 0;
 	clear_nlink(d_inode(dentry));
@@ -710,7 +716,7 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry)
 
 	p_ino = autofs_dentry_ino(dentry->d_parent);
 	p_ino->count--;
-	dput(ino->dentry);
+	d_make_discardable(dentry);
 	d_inode(dentry)->i_size = 0;
 	clear_nlink(d_inode(dentry));
 
@@ -720,9 +726,9 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry)
 	return 0;
 }
 
-static int autofs_dir_mkdir(struct mnt_idmap *idmap,
-			    struct inode *dir, struct dentry *dentry,
-			    umode_t mode)
+static struct dentry *autofs_dir_mkdir(struct mnt_idmap *idmap,
+				       struct inode *dir, struct dentry *dentry,
+				       umode_t mode)
 {
 	struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
 	struct autofs_info *ino = autofs_dentry_ino(dentry);
@@ -739,19 +745,18 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap,
 
 	inode = autofs_get_inode(dir->i_sb, S_IFDIR | mode);
 	if (!inode)
-		return -ENOMEM;
-	d_add(dentry, inode);
+		return ERR_PTR(-ENOMEM);
 
 	if (sbi->version < 5)
 		autofs_set_leaf_automount_flags(dentry);
 
-	dget(dentry);
+	d_make_persistent(dentry, inode);
 	p_ino = autofs_dentry_ino(dentry->d_parent);
 	p_ino->count++;
 	inc_nlink(dir);
 	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 
-	return 0;
+	return NULL;
 }
 
 /* Get/set timeout ioctl() operation */
diff --git a/fs/backing-file.c b/fs/backing-file.c
index afb557446c27..45da8600d564 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -41,7 +41,7 @@ struct file *backing_file_open(const struct path *user_path, int flags,
 		return f;
 
 	path_get(user_path);
-	*backing_file_user_path(f) = *user_path;
+	backing_file_set_user_path(f, user_path);
 	error = vfs_open(real_path, f);
 	if (error) {
 		fput(f);
@@ -65,7 +65,7 @@ struct file *backing_tmpfile_open(const struct path *user_path, int flags,
 		return f;
 
 	path_get(user_path);
-	*backing_file_user_path(f) = *user_path;
+	backing_file_set_user_path(f, user_path);
 	error = vfs_tmpfile(real_idmap, real_parentpath, f, mode);
 	if (error) {
 		fput(f);
@@ -80,7 +80,7 @@ struct backing_aio {
 	refcount_t ref;
 	struct kiocb *orig_iocb;
 	/* used for aio completion */
-	void (*end_write)(struct file *);
+	void (*end_write)(struct kiocb *iocb, ssize_t);
 	struct work_struct work;
 	long res;
 };
@@ -108,10 +108,10 @@ static void backing_aio_cleanup(struct backing_aio *aio, long res)
 	struct kiocb *iocb = &aio->iocb;
 	struct kiocb *orig_iocb = aio->orig_iocb;
 
+	orig_iocb->ki_pos = iocb->ki_pos;
 	if (aio->end_write)
-		aio->end_write(orig_iocb->ki_filp);
+		aio->end_write(orig_iocb, res);
 
-	orig_iocb->ki_pos = iocb->ki_pos;
 	backing_aio_put(aio);
 }
 
@@ -157,13 +157,37 @@ static int backing_aio_init_wq(struct kiocb *iocb)
 	return sb_init_dio_done_wq(sb);
 }
 
+static int do_backing_file_read_iter(struct file *file, struct iov_iter *iter,
+				     struct kiocb *iocb, int flags)
+{
+	struct backing_aio *aio = NULL;
+	int ret;
+
+	if (is_sync_kiocb(iocb)) {
+		rwf_t rwf = iocb_to_rw_flags(flags);
+
+		return vfs_iter_read(file, iter, &iocb->ki_pos, rwf);
+	}
+
+	aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
+	if (!aio)
+		return -ENOMEM;
+
+	aio->orig_iocb = iocb;
+	kiocb_clone(&aio->iocb, iocb, get_file(file));
+	aio->iocb.ki_complete = backing_aio_rw_complete;
+	refcount_set(&aio->ref, 2);
+	ret = vfs_iocb_iter_read(file, &aio->iocb, iter);
+	backing_aio_put(aio);
+	if (ret != -EIOCBQUEUED)
+		backing_aio_cleanup(aio, ret);
+	return ret;
+}
 
 ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
 			       struct kiocb *iocb, int flags,
 			       struct backing_file_ctx *ctx)
 {
-	struct backing_aio *aio = NULL;
-	const struct cred *old_cred;
 	ssize_t ret;
 
 	if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
@@ -176,41 +200,57 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
 	    !(file->f_mode & FMODE_CAN_ODIRECT))
 		return -EINVAL;
 
-	old_cred = override_creds(ctx->cred);
+	scoped_with_creds(ctx->cred)
+		ret = do_backing_file_read_iter(file, iter, iocb, flags);
+
+	if (ctx->accessed)
+		ctx->accessed(iocb->ki_filp);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(backing_file_read_iter);
+
+static int do_backing_file_write_iter(struct file *file, struct iov_iter *iter,
+				      struct kiocb *iocb, int flags,
+				      void (*end_write)(struct kiocb *, ssize_t))
+{
+	struct backing_aio *aio;
+	int ret;
+
 	if (is_sync_kiocb(iocb)) {
 		rwf_t rwf = iocb_to_rw_flags(flags);
 
-		ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf);
-	} else {
-		ret = -ENOMEM;
-		aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
-		if (!aio)
-			goto out;
-
-		aio->orig_iocb = iocb;
-		kiocb_clone(&aio->iocb, iocb, get_file(file));
-		aio->iocb.ki_complete = backing_aio_rw_complete;
-		refcount_set(&aio->ref, 2);
-		ret = vfs_iocb_iter_read(file, &aio->iocb, iter);
-		backing_aio_put(aio);
-		if (ret != -EIOCBQUEUED)
-			backing_aio_cleanup(aio, ret);
+		ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf);
+		if (end_write)
+			end_write(iocb, ret);
+		return ret;
 	}
-out:
-	revert_creds(old_cred);
 
-	if (ctx->accessed)
-		ctx->accessed(ctx->user_file);
+	ret = backing_aio_init_wq(iocb);
+	if (ret)
+		return ret;
 
+	aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
+	if (!aio)
+		return -ENOMEM;
+
+	aio->orig_iocb = iocb;
+	aio->end_write = end_write;
+	kiocb_clone(&aio->iocb, iocb, get_file(file));
+	aio->iocb.ki_flags = flags;
+	aio->iocb.ki_complete = backing_aio_queue_completion;
+	refcount_set(&aio->ref, 2);
+	ret = vfs_iocb_iter_write(file, &aio->iocb, iter);
+	backing_aio_put(aio);
+	if (ret != -EIOCBQUEUED)
+		backing_aio_cleanup(aio, ret);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(backing_file_read_iter);
 
 ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
 				struct kiocb *iocb, int flags,
 				struct backing_file_ctx *ctx)
 {
-	const struct cred *old_cred;
 	ssize_t ret;
 
 	if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
@@ -219,7 +259,7 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
 	if (!iov_iter_count(iter))
 		return 0;
 
-	ret = file_remove_privs(ctx->user_file);
+	ret = file_remove_privs(iocb->ki_filp);
 	if (ret)
 		return ret;
 
@@ -227,94 +267,56 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
 	    !(file->f_mode & FMODE_CAN_ODIRECT))
 		return -EINVAL;
 
-	/*
-	 * Stacked filesystems don't support deferred completions, don't copy
-	 * this property in case it is set by the issuer.
-	 */
-	flags &= ~IOCB_DIO_CALLER_COMP;
-
-	old_cred = override_creds(ctx->cred);
-	if (is_sync_kiocb(iocb)) {
-		rwf_t rwf = iocb_to_rw_flags(flags);
-
-		ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf);
-		if (ctx->end_write)
-			ctx->end_write(ctx->user_file);
-	} else {
-		struct backing_aio *aio;
-
-		ret = backing_aio_init_wq(iocb);
-		if (ret)
-			goto out;
-
-		ret = -ENOMEM;
-		aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
-		if (!aio)
-			goto out;
-
-		aio->orig_iocb = iocb;
-		aio->end_write = ctx->end_write;
-		kiocb_clone(&aio->iocb, iocb, get_file(file));
-		aio->iocb.ki_flags = flags;
-		aio->iocb.ki_complete = backing_aio_queue_completion;
-		refcount_set(&aio->ref, 2);
-		ret = vfs_iocb_iter_write(file, &aio->iocb, iter);
-		backing_aio_put(aio);
-		if (ret != -EIOCBQUEUED)
-			backing_aio_cleanup(aio, ret);
-	}
-out:
-	revert_creds(old_cred);
-
-	return ret;
+	scoped_with_creds(ctx->cred)
+		return do_backing_file_write_iter(file, iter, iocb, flags, ctx->end_write);
 }
 EXPORT_SYMBOL_GPL(backing_file_write_iter);
 
-ssize_t backing_file_splice_read(struct file *in, loff_t *ppos,
+ssize_t backing_file_splice_read(struct file *in, struct kiocb *iocb,
 				 struct pipe_inode_info *pipe, size_t len,
 				 unsigned int flags,
 				 struct backing_file_ctx *ctx)
 {
-	const struct cred *old_cred;
 	ssize_t ret;
 
 	if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING)))
 		return -EIO;
 
-	old_cred = override_creds(ctx->cred);
-	ret = vfs_splice_read(in, ppos, pipe, len, flags);
-	revert_creds(old_cred);
+	scoped_with_creds(ctx->cred)
+		ret = vfs_splice_read(in, &iocb->ki_pos, pipe, len, flags);
 
 	if (ctx->accessed)
-		ctx->accessed(ctx->user_file);
+		ctx->accessed(iocb->ki_filp);
 
 	return ret;
 }
 EXPORT_SYMBOL_GPL(backing_file_splice_read);
 
 ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
-				  struct file *out, loff_t *ppos, size_t len,
-				  unsigned int flags,
+				  struct file *out, struct kiocb *iocb,
+				  size_t len, unsigned int flags,
 				  struct backing_file_ctx *ctx)
 {
-	const struct cred *old_cred;
 	ssize_t ret;
 
 	if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING)))
 		return -EIO;
 
-	ret = file_remove_privs(ctx->user_file);
+	if (!out->f_op->splice_write)
+		return -EINVAL;
+
+	ret = file_remove_privs(iocb->ki_filp);
 	if (ret)
 		return ret;
 
-	old_cred = override_creds(ctx->cred);
-	file_start_write(out);
-	ret = iter_file_splice_write(pipe, out, ppos, len, flags);
-	file_end_write(out);
-	revert_creds(old_cred);
+	scoped_with_creds(ctx->cred) {
+		file_start_write(out);
+		ret = out->f_op->splice_write(pipe, out, &iocb->ki_pos, len, flags);
+		file_end_write(out);
+	}
 
 	if (ctx->end_write)
-		ctx->end_write(ctx->user_file);
+		ctx->end_write(iocb, ret);
 
 	return ret;
 }
@@ -323,24 +325,22 @@ EXPORT_SYMBOL_GPL(backing_file_splice_write);
 int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
 		      struct backing_file_ctx *ctx)
 {
-	const struct cred *old_cred;
+	struct file *user_file = vma->vm_file;
 	int ret;
 
-	if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)) ||
-	    WARN_ON_ONCE(ctx->user_file != vma->vm_file))
+	if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
 		return -EIO;
 
-	if (!file->f_op->mmap)
+	if (!can_mmap_file(file))
 		return -ENODEV;
 
 	vma_set_file(vma, file);
 
-	old_cred = override_creds(ctx->cred);
-	ret = call_mmap(vma->vm_file, vma);
-	revert_creds(old_cred);
+	scoped_with_creds(ctx->cred)
+		ret = vfs_mmap(vma->vm_file, vma);
 
 	if (ctx->accessed)
-		ctx->accessed(ctx->user_file);
+		ctx->accessed(user_file);
 
 	return ret;
 }
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 316d88da2ce1..0ef9bcb744dd 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -58,10 +58,10 @@ static int bad_inode_symlink(struct mnt_idmap *idmap,
 	return -EIO;
 }
 
-static int bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir,
-			   struct dentry *dentry, umode_t mode)
+static struct dentry *bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+				      struct dentry *dentry, umode_t mode)
 {
-	return -EIO;
+	return ERR_PTR(-EIO);
 }
 
 static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry)
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
deleted file mode 100644
index 5cdfef3b551a..000000000000
--- a/fs/bcachefs/Kconfig
+++ /dev/null
@@ -1,97 +0,0 @@
-
-config BCACHEFS_FS
-	tristate "bcachefs filesystem support (EXPERIMENTAL)"
-	depends on BLOCK
-	select EXPORTFS
-	select CLOSURES
-	select LIBCRC32C
-	select CRC64
-	select FS_POSIX_ACL
-	select LZ4_COMPRESS
-	select LZ4_DECOMPRESS
-	select LZ4HC_COMPRESS
-	select LZ4HC_DECOMPRESS
-	select ZLIB_DEFLATE
-	select ZLIB_INFLATE
-	select ZSTD_COMPRESS
-	select ZSTD_DECOMPRESS
-	select CRYPTO_SHA256
-	select CRYPTO_CHACHA20
-	select CRYPTO_POLY1305
-	select KEYS
-	select RAID6_PQ
-	select XOR_BLOCKS
-	select XXHASH
-	select SRCU
-	select SYMBOLIC_ERRNAME
-	help
-	The bcachefs filesystem - a modern, copy on write filesystem, with
-	support for multiple devices, compression, checksumming, etc.
-
-config BCACHEFS_QUOTA
-	bool "bcachefs quota support"
-	depends on BCACHEFS_FS
-	select QUOTACTL
-
-config BCACHEFS_ERASURE_CODING
-	bool "bcachefs erasure coding (RAID5/6) support (EXPERIMENTAL)"
-	depends on BCACHEFS_FS
-	select QUOTACTL
-	help
-	This enables the "erasure_code" filesysystem and inode option, which
-	organizes data into reed-solomon stripes instead of ordinary
-	replication.
-
-	WARNING: this feature is still undergoing on disk format changes, and
-	should only be enabled for testing purposes.
-
-config BCACHEFS_POSIX_ACL
-	bool "bcachefs POSIX ACL support"
-	depends on BCACHEFS_FS
-	select FS_POSIX_ACL
-
-config BCACHEFS_DEBUG
-	bool "bcachefs debugging"
-	depends on BCACHEFS_FS
-	help
-	Enables many extra debugging checks and assertions.
-
-	The resulting code will be significantly slower than normal; you
-	probably shouldn't select this option unless you're a developer.
-
-config BCACHEFS_TESTS
-	bool "bcachefs unit and performance tests"
-	depends on BCACHEFS_FS
-	help
-	Include some unit and performance tests for the core btree code
-
-config BCACHEFS_LOCK_TIME_STATS
-       bool "bcachefs lock time statistics"
-       depends on BCACHEFS_FS
-       help
-       Expose statistics for how long we held a lock in debugfs
-
-config BCACHEFS_NO_LATENCY_ACCT
-	bool "disable latency accounting and time stats"
-	depends on BCACHEFS_FS
-	help
-	This disables device latency tracking and time stats, only for performance testing
-
-config BCACHEFS_SIX_OPTIMISTIC_SPIN
-	bool "Optimistic spinning for six locks"
-	depends on BCACHEFS_FS
-	depends on SMP
-	default y
-	help
-	Instead of immediately sleeping when attempting to take a six lock that
-	is held by another thread, spin for a short while, as long as the
-	thread owning the lock is running.
-
-config MEAN_AND_VARIANCE_UNIT_TEST
-	tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
-	depends on KUNIT
-	depends on BCACHEFS_FS
-	default KUNIT_ALL_TESTS
-	help
-	  This option enables the kunit tests for mean_and_variance module.
-	  If unsure, say N.
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
deleted file mode 100644
index 66ca0bbee639..000000000000
--- a/fs/bcachefs/Makefile
+++ /dev/null
@@ -1,99 +0,0 @@
-
-obj-$(CONFIG_BCACHEFS_FS)	+= bcachefs.o
-
-bcachefs-y		:=	\
-	acl.o			\
-	alloc_background.o	\
-	alloc_foreground.o	\
-	backpointers.o		\
-	bkey.o			\
-	bkey_methods.o		\
-	bkey_sort.o		\
-	bset.o			\
-	btree_cache.o		\
-	btree_gc.o		\
-	btree_io.o		\
-	btree_iter.o		\
-	btree_journal_iter.o	\
-	btree_key_cache.o	\
-	btree_locking.o		\
-	btree_node_scan.o	\
-	btree_trans_commit.o	\
-	btree_update.o		\
-	btree_update_interior.o	\
-	btree_write_buffer.o	\
-	buckets.o		\
-	buckets_waiting_for_journal.o	\
-	chardev.o		\
-	checksum.o		\
-	clock.o			\
-	compress.o		\
-	darray.o		\
-	debug.o			\
-	dirent.o		\
-	disk_groups.o		\
-	data_update.o		\
-	ec.o			\
-	errcode.o		\
-	error.o			\
-	extents.o		\
-	extent_update.o		\
-	eytzinger.o		\
-	fs.o			\
-	fs-common.o		\
-	fs-ioctl.o		\
-	fs-io.o			\
-	fs-io-buffered.o	\
-	fs-io-direct.o		\
-	fs-io-pagecache.o	\
-	fsck.o			\
-	inode.o			\
-	io_read.o		\
-	io_misc.o		\
-	io_write.o		\
-	journal.o		\
-	journal_io.o		\
-	journal_reclaim.o	\
-	journal_sb.o		\
-	journal_seq_blacklist.o	\
-	keylist.o		\
-	logged_ops.o		\
-	lru.o			\
-	mean_and_variance.o	\
-	migrate.o		\
-	move.o			\
-	movinggc.o		\
-	nocow_locking.o		\
-	opts.o			\
-	printbuf.o		\
-	quota.o			\
-	rebalance.o		\
-	recovery.o		\
-	recovery_passes.o	\
-	reflink.o		\
-	replicas.o		\
-	sb-clean.o		\
-	sb-counters.o		\
-	sb-downgrade.o		\
-	sb-errors.o		\
-	sb-members.o		\
-	siphash.o		\
-	six.o			\
-	snapshot.o		\
-	subvolume.o		\
-	super.o			\
-	super-io.o		\
-	sysfs.o			\
-	tests.o			\
-	time_stats.o		\
-	thread_with_file.o	\
-	trace.o			\
-	two_state_shared_lock.o	\
-	util.o			\
-	varint.o		\
-	xattr.o
-
-obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST)   += mean_and_variance_test.o
-
-# Silence "note: xyz changed in GCC X.X" messages
-subdir-ccflags-y += $(call cc-disable-warning, psabi)
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
deleted file mode 100644
index 250d6c6d3a3a..000000000000
--- a/fs/bcachefs/acl.c
+++ /dev/null
@@ -1,447 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-
-#include "acl.h"
-#include "xattr.h"
-
-#include <linux/posix_acl.h>
-
-static const char * const acl_types[] = {
-	[ACL_USER_OBJ]	= "user_obj",
-	[ACL_USER]	= "user",
-	[ACL_GROUP_OBJ]	= "group_obj",
-	[ACL_GROUP]	= "group",
-	[ACL_MASK]	= "mask",
-	[ACL_OTHER]	= "other",
-	NULL,
-};
-
-void bch2_acl_to_text(struct printbuf *out, const void *value, size_t size)
-{
-	const void *p, *end = value + size;
-
-	if (!value ||
-	    size < sizeof(bch_acl_header) ||
-	    ((bch_acl_header *)value)->a_version != cpu_to_le32(BCH_ACL_VERSION))
-		return;
-
-	p = value + sizeof(bch_acl_header);
-	while (p < end) {
-		const bch_acl_entry *in = p;
-		unsigned tag = le16_to_cpu(in->e_tag);
-
-		prt_str(out, acl_types[tag]);
-
-		switch (tag) {
-		case ACL_USER_OBJ:
-		case ACL_GROUP_OBJ:
-		case ACL_MASK:
-		case ACL_OTHER:
-			p += sizeof(bch_acl_entry_short);
-			break;
-		case ACL_USER:
-			prt_printf(out, " uid %u", le32_to_cpu(in->e_id));
-			p += sizeof(bch_acl_entry);
-			break;
-		case ACL_GROUP:
-			prt_printf(out, " gid %u", le32_to_cpu(in->e_id));
-			p += sizeof(bch_acl_entry);
-			break;
-		}
-
-		prt_printf(out, " %o", le16_to_cpu(in->e_perm));
-
-		if (p != end)
-			prt_char(out, ' ');
-	}
-}
-
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-
-#include "fs.h"
-
-#include <linux/fs.h>
-#include <linux/posix_acl_xattr.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-
-static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long)
-{
-	return sizeof(bch_acl_header) +
-		sizeof(bch_acl_entry_short) * nr_short +
-		sizeof(bch_acl_entry) * nr_long;
-}
-
-static inline int acl_to_xattr_type(int type)
-{
-	switch (type) {
-	case ACL_TYPE_ACCESS:
-		return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS;
-	case ACL_TYPE_DEFAULT:
-		return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT;
-	default:
-		BUG();
-	}
-}
-
-/*
- * Convert from filesystem to in-memory representation.
- */
-static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans,
-					    const void *value, size_t size)
-{
-	const void *p, *end = value + size;
-	struct posix_acl *acl;
-	struct posix_acl_entry *out;
-	unsigned count = 0;
-	int ret;
-
-	if (!value)
-		return NULL;
-	if (size < sizeof(bch_acl_header))
-		goto invalid;
-	if (((bch_acl_header *)value)->a_version !=
-	    cpu_to_le32(BCH_ACL_VERSION))
-		goto invalid;
-
-	p = value + sizeof(bch_acl_header);
-	while (p < end) {
-		const bch_acl_entry *entry = p;
-
-		if (p + sizeof(bch_acl_entry_short) > end)
-			goto invalid;
-
-		switch (le16_to_cpu(entry->e_tag)) {
-		case ACL_USER_OBJ:
-		case ACL_GROUP_OBJ:
-		case ACL_MASK:
-		case ACL_OTHER:
-			p += sizeof(bch_acl_entry_short);
-			break;
-		case ACL_USER:
-		case ACL_GROUP:
-			p += sizeof(bch_acl_entry);
-			break;
-		default:
-			goto invalid;
-		}
-
-		count++;
-	}
-
-	if (p > end)
-		goto invalid;
-
-	if (!count)
-		return NULL;
-
-	acl = allocate_dropping_locks(trans, ret,
-			posix_acl_alloc(count, _gfp));
-	if (!acl)
-		return ERR_PTR(-ENOMEM);
-	if (ret) {
-		kfree(acl);
-		return ERR_PTR(ret);
-	}
-
-	out = acl->a_entries;
-
-	p = value + sizeof(bch_acl_header);
-	while (p < end) {
-		const bch_acl_entry *in = p;
-
-		out->e_tag  = le16_to_cpu(in->e_tag);
-		out->e_perm = le16_to_cpu(in->e_perm);
-
-		switch (out->e_tag) {
-		case ACL_USER_OBJ:
-		case ACL_GROUP_OBJ:
-		case ACL_MASK:
-		case ACL_OTHER:
-			p += sizeof(bch_acl_entry_short);
-			break;
-		case ACL_USER:
-			out->e_uid = make_kuid(&init_user_ns,
-					       le32_to_cpu(in->e_id));
-			p += sizeof(bch_acl_entry);
-			break;
-		case ACL_GROUP:
-			out->e_gid = make_kgid(&init_user_ns,
-					       le32_to_cpu(in->e_id));
-			p += sizeof(bch_acl_entry);
-			break;
-		}
-
-		out++;
-	}
-
-	BUG_ON(out != acl->a_entries + acl->a_count);
-
-	return acl;
-invalid:
-	pr_err("invalid acl entry");
-	return ERR_PTR(-EINVAL);
-}
-
-#define acl_for_each_entry(acl, acl_e)			\
-	for (acl_e = acl->a_entries;			\
-	     acl_e < acl->a_entries + acl->a_count;	\
-	     acl_e++)
-
-/*
- * Convert from in-memory to filesystem representation.
- */
-static struct bkey_i_xattr *
-bch2_acl_to_xattr(struct btree_trans *trans,
-		  const struct posix_acl *acl,
-		  int type)
-{
-	struct bkey_i_xattr *xattr;
-	bch_acl_header *acl_header;
-	const struct posix_acl_entry *acl_e;
-	void *outptr;
-	unsigned nr_short = 0, nr_long = 0, acl_len, u64s;
-
-	acl_for_each_entry(acl, acl_e) {
-		switch (acl_e->e_tag) {
-		case ACL_USER:
-		case ACL_GROUP:
-			nr_long++;
-			break;
-		case ACL_USER_OBJ:
-		case ACL_GROUP_OBJ:
-		case ACL_MASK:
-		case ACL_OTHER:
-			nr_short++;
-			break;
-		default:
-			return ERR_PTR(-EINVAL);
-		}
-	}
-
-	acl_len = bch2_acl_size(nr_short, nr_long);
-	u64s = BKEY_U64s + xattr_val_u64s(0, acl_len);
-
-	if (u64s > U8_MAX)
-		return ERR_PTR(-E2BIG);
-
-	xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
-	if (IS_ERR(xattr))
-		return xattr;
-
-	bkey_xattr_init(&xattr->k_i);
-	xattr->k.u64s		= u64s;
-	xattr->v.x_type		= acl_to_xattr_type(type);
-	xattr->v.x_name_len	= 0;
-	xattr->v.x_val_len	= cpu_to_le16(acl_len);
-
-	acl_header = xattr_val(&xattr->v);
-	acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION);
-
-	outptr = (void *) acl_header + sizeof(*acl_header);
-
-	acl_for_each_entry(acl, acl_e) {
-		bch_acl_entry *entry = outptr;
-
-		entry->e_tag = cpu_to_le16(acl_e->e_tag);
-		entry->e_perm = cpu_to_le16(acl_e->e_perm);
-		switch (acl_e->e_tag) {
-		case ACL_USER:
-			entry->e_id = cpu_to_le32(
-				from_kuid(&init_user_ns, acl_e->e_uid));
-			outptr += sizeof(bch_acl_entry);
-			break;
-		case ACL_GROUP:
-			entry->e_id = cpu_to_le32(
-				from_kgid(&init_user_ns, acl_e->e_gid));
-			outptr += sizeof(bch_acl_entry);
-			break;
-
-		case ACL_USER_OBJ:
-		case ACL_GROUP_OBJ:
-		case ACL_MASK:
-		case ACL_OTHER:
-			outptr += sizeof(bch_acl_entry_short);
-			break;
-		}
-	}
-
-	BUG_ON(outptr != xattr_val(&xattr->v) + acl_len);
-
-	return xattr;
-}
-
-struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
-			       struct dentry *dentry, int type)
-{
-	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
-	struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
-	struct btree_trans *trans = bch2_trans_get(c);
-	struct btree_iter iter = { NULL };
-	struct posix_acl *acl = NULL;
-retry:
-	bch2_trans_begin(trans);
-
-	struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
-					     &hash, inode_inum(inode), &search, 0);
-	int ret = bkey_err(k);
-	if (ret)
-		goto err;
-
-	struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
-	acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
-				 le16_to_cpu(xattr.v->x_val_len));
-	ret = PTR_ERR_OR_ZERO(acl);
-err:
-	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-		goto retry;
-
-	if (ret)
-		acl = !bch2_err_matches(ret, ENOENT) ? ERR_PTR(ret) : NULL;
-
-	if (!IS_ERR_OR_NULL(acl))
-		set_cached_acl(&inode->v, type, acl);
-
-	bch2_trans_iter_exit(trans, &iter);
-	bch2_trans_put(trans);
-	return acl;
-}
-
-int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
-		       struct bch_inode_unpacked *inode_u,
-		       struct posix_acl *acl, int type)
-{
-	struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u);
-	int ret;
-
-	if (type == ACL_TYPE_DEFAULT &&
-	    !S_ISDIR(inode_u->bi_mode))
-		return acl ? -EACCES : 0;
-
-	if (acl) {
-		struct bkey_i_xattr *xattr =
-			bch2_acl_to_xattr(trans, acl, type);
-		if (IS_ERR(xattr))
-			return PTR_ERR(xattr);
-
-		ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info,
-				    inum, &xattr->k_i, 0);
-	} else {
-		struct xattr_search_key search =
-			X_SEARCH(acl_to_xattr_type(type), "", 0);
-
-		ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info,
-				       inum, &search);
-	}
-
-	return bch2_err_matches(ret, ENOENT) ? 0 : ret;
-}
-
-int bch2_set_acl(struct mnt_idmap *idmap,
-		 struct dentry *dentry,
-		 struct posix_acl *_acl, int type)
-{
-	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct btree_trans *trans = bch2_trans_get(c);
-	struct btree_iter inode_iter = { NULL };
-	struct bch_inode_unpacked inode_u;
-	struct posix_acl *acl;
-	umode_t mode;
-	int ret;
-
-	mutex_lock(&inode->ei_update_lock);
-retry:
-	bch2_trans_begin(trans);
-	acl = _acl;
-
-	ret   = bch2_subvol_is_ro_trans(trans, inode->ei_subvol) ?:
-		bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
-			      BTREE_ITER_intent);
-	if (ret)
-		goto btree_err;
-
-	mode = inode_u.bi_mode;
-
-	if (type == ACL_TYPE_ACCESS) {
-		ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl);
-		if (ret)
-			goto btree_err;
-	}
-
-	ret = bch2_set_acl_trans(trans, inode_inum(inode), &inode_u, acl, type);
-	if (ret)
-		goto btree_err;
-
-	inode_u.bi_ctime	= bch2_current_time(c);
-	inode_u.bi_mode		= mode;
-
-	ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
-		bch2_trans_commit(trans, NULL, NULL, 0);
-btree_err:
-	bch2_trans_iter_exit(trans, &inode_iter);
-
-	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-		goto retry;
-	if (unlikely(ret))
-		goto err;
-
-	bch2_inode_update_after_write(trans, inode, &inode_u,
-				      ATTR_CTIME|ATTR_MODE);
-
-	set_cached_acl(&inode->v, type, acl);
-err:
-	mutex_unlock(&inode->ei_update_lock);
-	bch2_trans_put(trans);
-
-	return ret;
-}
-
-int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
-		   struct bch_inode_unpacked *inode,
-		   umode_t mode,
-		   struct posix_acl **new_acl)
-{
-	struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode);
-	struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0);
-	struct btree_iter iter;
-	struct posix_acl *acl = NULL;
-
-	struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
-			       &hash_info, inum, &search, BTREE_ITER_intent);
-	int ret = bkey_err(k);
-	if (ret)
-		return bch2_err_matches(ret, ENOENT) ? 0 : ret;
-
-	struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
-
-	acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
-			le16_to_cpu(xattr.v->x_val_len));
-	ret = PTR_ERR_OR_ZERO(acl);
-	if (ret)
-		goto err;
-
-	ret = allocate_dropping_locks_errcode(trans, __posix_acl_chmod(&acl, _gfp, mode));
-	if (ret)
-		goto err;
-
-	struct bkey_i_xattr *new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
-	ret = PTR_ERR_OR_ZERO(new);
-	if (ret)
-		goto err;
-
-	new->k.p = iter.pos;
-	ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
-	*new_acl = acl;
-	acl = NULL;
-err:
-	bch2_trans_iter_exit(trans, &iter);
-	if (!IS_ERR_OR_NULL(acl))
-		kfree(acl);
-	return ret;
-}
-
-#endif /* CONFIG_BCACHEFS_POSIX_ACL */
diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h
deleted file mode 100644
index 27e7eec0f278..000000000000
--- a/fs/bcachefs/acl.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ACL_H
-#define _BCACHEFS_ACL_H
-
-struct bch_inode_unpacked;
-struct bch_hash_info;
-struct bch_inode_info;
-struct posix_acl;
-
-#define BCH_ACL_VERSION	0x0001
-
-typedef struct {
-	__le16		e_tag;
-	__le16		e_perm;
-	__le32		e_id;
-} bch_acl_entry;
-
-typedef struct {
-	__le16		e_tag;
-	__le16		e_perm;
-} bch_acl_entry_short;
-
-typedef struct {
-	__le32		a_version;
-} bch_acl_header;
-
-void bch2_acl_to_text(struct printbuf *, const void *, size_t);
-
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-
-struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int);
-
-int bch2_set_acl_trans(struct btree_trans *, subvol_inum,
-		       struct bch_inode_unpacked *,
-		       struct posix_acl *, int);
-int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int);
-int bch2_acl_chmod(struct btree_trans *, subvol_inum,
-		   struct bch_inode_unpacked *,
-		   umode_t, struct posix_acl **);
-
-#else
-
-static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
-				     struct bch_inode_unpacked *inode_u,
-				     struct posix_acl *acl, int type)
-{
-	return 0;
-}
-
-static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
-				 struct bch_inode_unpacked *inode,
-				 umode_t mode,
-				 struct posix_acl **new_acl)
-{
-	return 0;
-}
-
-#endif /* CONFIG_BCACHEFS_POSIX_ACL */
-
-#endif /* _BCACHEFS_ACL_H */
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
deleted file mode 100644
index 346cd91f91f9..000000000000
--- a/fs/bcachefs/alloc_background.c
+++ /dev/null
@@ -1,2358 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_key_cache.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_gc.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "buckets_waiting_for_journal.h"
-#include "clock.h"
-#include "debug.h"
-#include "ec.h"
-#include "error.h"
-#include "lru.h"
-#include "recovery.h"
-#include "trace.h"
-#include "varint.h"
-
-#include <linux/kthread.h>
-#include <linux/math64.h>
-#include <linux/random.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
-#include <linux/sched/task.h>
-#include <linux/sort.h>
-
-static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket);
-
-/* Persistent alloc info: */
-
-static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
-#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
-	BCH_ALLOC_FIELDS_V1()
-#undef x
-};
-
-struct bkey_alloc_unpacked {
-	u64		journal_seq;
-	u8		gen;
-	u8		oldest_gen;
-	u8		data_type;
-	bool		need_discard:1;
-	bool		need_inc_gen:1;
-#define x(_name, _bits)	u##_bits _name;
-	BCH_ALLOC_FIELDS_V2()
-#undef  x
-};
-
-static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
-				     const void **p, unsigned field)
-{
-	unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
-	u64 v;
-
-	if (!(a->fields & (1 << field)))
-		return 0;
-
-	switch (bytes) {
-	case 1:
-		v = *((const u8 *) *p);
-		break;
-	case 2:
-		v = le16_to_cpup(*p);
-		break;
-	case 4:
-		v = le32_to_cpup(*p);
-		break;
-	case 8:
-		v = le64_to_cpup(*p);
-		break;
-	default:
-		BUG();
-	}
-
-	*p += bytes;
-	return v;
-}
-
-static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
-				 struct bkey_s_c k)
-{
-	const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
-	const void *d = in->data;
-	unsigned idx = 0;
-
-	out->gen = in->gen;
-
-#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
-	BCH_ALLOC_FIELDS_V1()
-#undef  x
-}
-
-static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
-				struct bkey_s_c k)
-{
-	struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
-	const u8 *in = a.v->data;
-	const u8 *end = bkey_val_end(a);
-	unsigned fieldnr = 0;
-	int ret;
-	u64 v;
-
-	out->gen	= a.v->gen;
-	out->oldest_gen	= a.v->oldest_gen;
-	out->data_type	= a.v->data_type;
-
-#define x(_name, _bits)							\
-	if (fieldnr < a.v->nr_fields) {					\
-		ret = bch2_varint_decode_fast(in, end, &v);		\
-		if (ret < 0)						\
-			return ret;					\
-		in += ret;						\
-	} else {							\
-		v = 0;							\
-	}								\
-	out->_name = v;							\
-	if (v != out->_name)						\
-		return -1;						\
-	fieldnr++;
-
-	BCH_ALLOC_FIELDS_V2()
-#undef  x
-	return 0;
-}
-
-static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
-				struct bkey_s_c k)
-{
-	struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
-	const u8 *in = a.v->data;
-	const u8 *end = bkey_val_end(a);
-	unsigned fieldnr = 0;
-	int ret;
-	u64 v;
-
-	out->gen	= a.v->gen;
-	out->oldest_gen	= a.v->oldest_gen;
-	out->data_type	= a.v->data_type;
-	out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
-	out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
-	out->journal_seq = le64_to_cpu(a.v->journal_seq);
-
-#define x(_name, _bits)							\
-	if (fieldnr < a.v->nr_fields) {					\
-		ret = bch2_varint_decode_fast(in, end, &v);		\
-		if (ret < 0)						\
-			return ret;					\
-		in += ret;						\
-	} else {							\
-		v = 0;							\
-	}								\
-	out->_name = v;							\
-	if (v != out->_name)						\
-		return -1;						\
-	fieldnr++;
-
-	BCH_ALLOC_FIELDS_V2()
-#undef  x
-	return 0;
-}
-
-static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
-{
-	struct bkey_alloc_unpacked ret = { .gen	= 0 };
-
-	switch (k.k->type) {
-	case KEY_TYPE_alloc:
-		bch2_alloc_unpack_v1(&ret, k);
-		break;
-	case KEY_TYPE_alloc_v2:
-		bch2_alloc_unpack_v2(&ret, k);
-		break;
-	case KEY_TYPE_alloc_v3:
-		bch2_alloc_unpack_v3(&ret, k);
-		break;
-	}
-
-	return ret;
-}
-
-static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
-{
-	unsigned i, bytes = offsetof(struct bch_alloc, data);
-
-	for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
-		if (a->fields & (1 << i))
-			bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
-
-	return DIV_ROUND_UP(bytes, sizeof(u64));
-}
-
-int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k,
-			  enum bch_validate_flags flags,
-			  struct printbuf *err)
-{
-	struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
-	int ret = 0;
-
-	/* allow for unknown fields */
-	bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, err,
-			 alloc_v1_val_size_bad,
-			 "incorrect value size (%zu < %u)",
-			 bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
-fsck_err:
-	return ret;
-}
-
-int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
-			  enum bch_validate_flags flags,
-			  struct printbuf *err)
-{
-	struct bkey_alloc_unpacked u;
-	int ret = 0;
-
-	bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, err,
-			 alloc_v2_unpack_error,
-			 "unpack error");
-fsck_err:
-	return ret;
-}
-
-int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
-			  enum bch_validate_flags flags,
-			  struct printbuf *err)
-{
-	struct bkey_alloc_unpacked u;
-	int ret = 0;
-
-	bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, err,
-			 alloc_v2_unpack_error,
-			 "unpack error");
-fsck_err:
-	return ret;
-}
-
-int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
-			  enum bch_validate_flags flags, struct printbuf *err)
-{
-	struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
-	int ret = 0;
-
-	bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k), c, err,
-			 alloc_v4_val_size_bad,
-			 "bad val size (%u > %zu)",
-			 alloc_v4_u64s_noerror(a.v), bkey_val_u64s(k.k));
-
-	bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
-			 BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err,
-			 alloc_v4_backpointers_start_bad,
-			 "invalid backpointers_start");
-
-	bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, c, err,
-			 alloc_key_data_type_bad,
-			 "invalid data type (got %u should be %u)",
-			 a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
-
-	switch (a.v->data_type) {
-	case BCH_DATA_free:
-	case BCH_DATA_need_gc_gens:
-	case BCH_DATA_need_discard:
-		bkey_fsck_err_on(bch2_bucket_sectors_total(*a.v) || a.v->stripe,
-				 c, err, alloc_key_empty_but_have_data,
-				 "empty data type free but have data");
-		break;
-	case BCH_DATA_sb:
-	case BCH_DATA_journal:
-	case BCH_DATA_btree:
-	case BCH_DATA_user:
-	case BCH_DATA_parity:
-		bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v),
-				 c, err, alloc_key_dirty_sectors_0,
-				 "data_type %s but dirty_sectors==0",
-				 bch2_data_type_str(a.v->data_type));
-		break;
-	case BCH_DATA_cached:
-		bkey_fsck_err_on(!a.v->cached_sectors ||
-				 bch2_bucket_sectors_dirty(*a.v) ||
-				 a.v->stripe,
-				 c, err, alloc_key_cached_inconsistency,
-				 "data type inconsistency");
-
-		bkey_fsck_err_on(!a.v->io_time[READ] &&
-				 c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
-				 c, err, alloc_key_cached_but_read_time_zero,
-				 "cached bucket with read_time == 0");
-		break;
-	case BCH_DATA_stripe:
-		break;
-	}
-fsck_err:
-	return ret;
-}
-
-void bch2_alloc_v4_swab(struct bkey_s k)
-{
-	struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
-	struct bch_backpointer *bp, *bps;
-
-	a->journal_seq		= swab64(a->journal_seq);
-	a->flags		= swab32(a->flags);
-	a->dirty_sectors	= swab32(a->dirty_sectors);
-	a->cached_sectors	= swab32(a->cached_sectors);
-	a->io_time[0]		= swab64(a->io_time[0]);
-	a->io_time[1]		= swab64(a->io_time[1]);
-	a->stripe		= swab32(a->stripe);
-	a->nr_external_backpointers = swab32(a->nr_external_backpointers);
-	a->fragmentation_lru	= swab64(a->fragmentation_lru);
-
-	bps = alloc_v4_backpointers(a);
-	for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
-		bp->bucket_offset	= swab40(bp->bucket_offset);
-		bp->bucket_len		= swab32(bp->bucket_len);
-		bch2_bpos_swab(&bp->pos);
-	}
-}
-
-void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
-	struct bch_alloc_v4 _a;
-	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
-
-	prt_newline(out);
-	printbuf_indent_add(out, 2);
-
-	prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
-	bch2_prt_data_type(out, a->data_type);
-	prt_newline(out);
-	prt_printf(out, "journal_seq       %llu\n",	a->journal_seq);
-	prt_printf(out, "need_discard      %llu\n",	BCH_ALLOC_V4_NEED_DISCARD(a));
-	prt_printf(out, "need_inc_gen      %llu\n",	BCH_ALLOC_V4_NEED_INC_GEN(a));
-	prt_printf(out, "dirty_sectors     %u\n",	a->dirty_sectors);
-	prt_printf(out, "cached_sectors    %u\n",	a->cached_sectors);
-	prt_printf(out, "stripe            %u\n",	a->stripe);
-	prt_printf(out, "stripe_redundancy %u\n",	a->stripe_redundancy);
-	prt_printf(out, "io_time[READ]     %llu\n",	a->io_time[READ]);
-	prt_printf(out, "io_time[WRITE]    %llu\n",	a->io_time[WRITE]);
-	prt_printf(out, "fragmentation     %llu\n",	a->fragmentation_lru);
-	prt_printf(out, "bp_start          %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a));
-	printbuf_indent_sub(out, 2);
-}
-
-void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
-{
-	if (k.k->type == KEY_TYPE_alloc_v4) {
-		void *src, *dst;
-
-		*out = *bkey_s_c_to_alloc_v4(k).v;
-
-		src = alloc_v4_backpointers(out);
-		SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
-		dst = alloc_v4_backpointers(out);
-
-		if (src < dst)
-			memset(src, 0, dst - src);
-
-		SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0);
-	} else {
-		struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
-
-		*out = (struct bch_alloc_v4) {
-			.journal_seq		= u.journal_seq,
-			.flags			= u.need_discard,
-			.gen			= u.gen,
-			.oldest_gen		= u.oldest_gen,
-			.data_type		= u.data_type,
-			.stripe_redundancy	= u.stripe_redundancy,
-			.dirty_sectors		= u.dirty_sectors,
-			.cached_sectors		= u.cached_sectors,
-			.io_time[READ]		= u.read_time,
-			.io_time[WRITE]		= u.write_time,
-			.stripe			= u.stripe,
-		};
-
-		SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
-	}
-}
-
-static noinline struct bkey_i_alloc_v4 *
-__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
-{
-	struct bkey_i_alloc_v4 *ret;
-
-	ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4)));
-	if (IS_ERR(ret))
-		return ret;
-
-	if (k.k->type == KEY_TYPE_alloc_v4) {
-		void *src, *dst;
-
-		bkey_reassemble(&ret->k_i, k);
-
-		src = alloc_v4_backpointers(&ret->v);
-		SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
-		dst = alloc_v4_backpointers(&ret->v);
-
-		if (src < dst)
-			memset(src, 0, dst - src);
-
-		SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0);
-		set_alloc_v4_u64s(ret);
-	} else {
-		bkey_alloc_v4_init(&ret->k_i);
-		ret->k.p = k.k->p;
-		bch2_alloc_to_v4(k, &ret->v);
-	}
-	return ret;
-}
-
-static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k)
-{
-	struct bkey_s_c_alloc_v4 a;
-
-	if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
-	    ((a = bkey_s_c_to_alloc_v4(k), true) &&
-	     BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0))
-		return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4);
-
-	return __bch2_alloc_to_v4_mut(trans, k);
-}
-
-struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
-{
-	return bch2_alloc_to_v4_mut_inlined(trans, k);
-}
-
-struct bkey_i_alloc_v4 *
-bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter,
-				       struct bpos pos)
-{
-	struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
-					       BTREE_ITER_with_updates|
-					       BTREE_ITER_cached|
-					       BTREE_ITER_intent);
-	int ret = bkey_err(k);
-	if (unlikely(ret))
-		return ERR_PTR(ret);
-
-	struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k);
-	ret = PTR_ERR_OR_ZERO(a);
-	if (unlikely(ret))
-		goto err;
-	return a;
-err:
-	bch2_trans_iter_exit(trans, iter);
-	return ERR_PTR(ret);
-}
-
-__flatten
-struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos)
-{
-	struct btree_iter iter;
-	struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos);
-	int ret = PTR_ERR_OR_ZERO(a);
-	if (ret)
-		return ERR_PTR(ret);
-
-	ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
-	bch2_trans_iter_exit(trans, &iter);
-	return unlikely(ret) ? ERR_PTR(ret) : a;
-}
-
-static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset)
-{
-	*offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK;
-
-	pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS;
-	return pos;
-}
-
-static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset)
-{
-	pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS;
-	pos.offset += offset;
-	return pos;
-}
-
-static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
-{
-	return k.k->type == KEY_TYPE_bucket_gens
-		? bkey_s_c_to_bucket_gens(k).v->gens[offset]
-		: 0;
-}
-
-int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k,
-			     enum bch_validate_flags flags,
-			     struct printbuf *err)
-{
-	int ret = 0;
-
-	bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, err,
-			 bucket_gens_val_size_bad,
-			 "bad val size (%zu != %zu)",
-			 bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
-fsck_err:
-	return ret;
-}
-
-void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
-	struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k);
-	unsigned i;
-
-	for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) {
-		if (i)
-			prt_char(out, ' ');
-		prt_printf(out, "%u", g.v->gens[i]);
-	}
-}
-
-int bch2_bucket_gens_init(struct bch_fs *c)
-{
-	struct btree_trans *trans = bch2_trans_get(c);
-	struct bkey_i_bucket_gens g;
-	bool have_bucket_gens_key = false;
-	int ret;
-
-	ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
-				 BTREE_ITER_prefetch, k, ({
-		/*
-		 * Not a fsck error because this is checked/repaired by
-		 * bch2_check_alloc_key() which runs later:
-		 */
-		if (!bch2_dev_bucket_exists(c, k.k->p))
-			continue;
-
-		struct bch_alloc_v4 a;
-		u8 gen = bch2_alloc_to_v4(k, &a)->gen;
-		unsigned offset;
-		struct bpos pos = alloc_gens_pos(iter.pos, &offset);
-		int ret2 = 0;
-
-		if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
-			ret2 =  bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?:
-				bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-			if (ret2)
-				goto iter_err;
-			have_bucket_gens_key = false;
-		}
-
-		if (!have_bucket_gens_key) {
-			bkey_bucket_gens_init(&g.k_i);
-			g.k.p = pos;
-			have_bucket_gens_key = true;
-		}
-
-		g.v.gens[offset] = gen;
-iter_err:
-		ret2;
-	}));
-
-	if (have_bucket_gens_key && !ret)
-		ret = commit_do(trans, NULL, NULL,
-				BCH_TRANS_COMMIT_no_enospc,
-			bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
-
-	bch2_trans_put(trans);
-
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-int bch2_alloc_read(struct bch_fs *c)
-{
-	struct btree_trans *trans = bch2_trans_get(c);
-	struct bch_dev *ca = NULL;
-	int ret;
-
-	down_read(&c->gc_lock);
-
-	if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
-		ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
-					 BTREE_ITER_prefetch, k, ({
-			u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
-			u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
-
-			if (k.k->type != KEY_TYPE_bucket_gens)
-				continue;
-
-			ca = bch2_dev_iterate(c, ca, k.k->p.inode);
-			/*
-			 * Not a fsck error because this is checked/repaired by
-			 * bch2_check_alloc_key() which runs later:
-			 */
-			if (!ca) {
-				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
-				continue;
-			}
-
-			const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v;
-
-			for (u64 b = max_t(u64, ca->mi.first_bucket, start);
-			     b < min_t(u64, ca->mi.nbuckets, end);
-			     b++)
-				*bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
-			0;
-		}));
-	} else {
-		ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
-					 BTREE_ITER_prefetch, k, ({
-			ca = bch2_dev_iterate(c, ca, k.k->p.inode);
-			/*
-			 * Not a fsck error because this is checked/repaired by
-			 * bch2_check_alloc_key() which runs later:
-			 */
-			if (!ca) {
-				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
-				continue;
-			}
-
-			struct bch_alloc_v4 a;
-			*bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
-			0;
-		}));
-	}
-
-	bch2_dev_put(ca);
-	bch2_trans_put(trans);
-	up_read(&c->gc_lock);
-
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-/* Free space/discard btree: */
-
-static int bch2_bucket_do_index(struct btree_trans *trans,
-				struct bch_dev *ca,
-				struct bkey_s_c alloc_k,
-				const struct bch_alloc_v4 *a,
-				bool set)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter iter;
-	struct bkey_s_c old;
-	struct bkey_i *k;
-	enum btree_id btree;
-	enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted;
-	enum bch_bkey_type new_type =  set ? KEY_TYPE_set : KEY_TYPE_deleted;
-	struct printbuf buf = PRINTBUF;
-	int ret;
-
-	if (a->data_type != BCH_DATA_free &&
-	    a->data_type != BCH_DATA_need_discard)
-		return 0;
-
-	k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
-	if (IS_ERR(k))
-		return PTR_ERR(k);
-
-	bkey_init(&k->k);
-	k->k.type = new_type;
-
-	switch (a->data_type) {
-	case BCH_DATA_free:
-		btree = BTREE_ID_freespace;
-		k->k.p = alloc_freespace_pos(alloc_k.k->p, *a);
-		bch2_key_resize(&k->k, 1);
-		break;
-	case BCH_DATA_need_discard:
-		btree = BTREE_ID_need_discard;
-		k->k.p = alloc_k.k->p;
-		break;
-	default:
-		return 0;
-	}
-
-	old = bch2_bkey_get_iter(trans, &iter, btree,
-			     bkey_start_pos(&k->k),
-			     BTREE_ITER_intent);
-	ret = bkey_err(old);
-	if (ret)
-		return ret;
-
-	if (ca->mi.freespace_initialized &&
-	    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info &&
-	    bch2_trans_inconsistent_on(old.k->type != old_type, trans,
-			"incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n"
-			"  for %s",
-			set ? "setting" : "clearing",
-			bch2_btree_id_str(btree),
-			iter.pos.inode,
-			iter.pos.offset,
-			bch2_bkey_types[old.k->type],
-			bch2_bkey_types[old_type],
-			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
-		ret = -EIO;
-		goto err;
-	}
-
-	ret = bch2_trans_update(trans, &iter, k, 0);
-err:
-	bch2_trans_iter_exit(trans, &iter);
-	printbuf_exit(&buf);
-	return ret;
-}
-
-static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
-					   struct bpos bucket, u8 gen)
-{
-	struct btree_iter iter;
-	unsigned offset;
-	struct bpos pos = alloc_gens_pos(bucket, &offset);
-	struct bkey_i_bucket_gens *g;
-	struct bkey_s_c k;
-	int ret;
-
-	g = bch2_trans_kmalloc(trans, sizeof(*g));
-	ret = PTR_ERR_OR_ZERO(g);
-	if (ret)
-		return ret;
-
-	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos,
-			       BTREE_ITER_intent|
-			       BTREE_ITER_with_updates);
-	ret = bkey_err(k);
-	if (ret)
-		return ret;
-
-	if (k.k->type != KEY_TYPE_bucket_gens) {
-		bkey_bucket_gens_init(&g->k_i);
-		g->k.p = iter.pos;
-	} else {
-		bkey_reassemble(&g->k_i, k);
-	}
-
-	g->v.gens[offset] = gen;
-
-	ret = bch2_trans_update(trans, &iter, &g->k_i, 0);
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-int bch2_trigger_alloc(struct btree_trans *trans,
-		       enum btree_id btree, unsigned level,
-		       struct bkey_s_c old, struct bkey_s new,
-		       enum btree_iter_update_trigger_flags flags)
-{
-	struct bch_fs *c = trans->c;
-	int ret = 0;
-
-	struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
-	if (!ca)
-		return -EIO;
-
-	struct bch_alloc_v4 old_a_convert;
-	const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
-
-	if (flags & BTREE_TRIGGER_transactional) {
-		struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
-
-		alloc_data_type_set(new_a, new_a->data_type);
-
-		if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) {
-			new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
-			new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
-			SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
-			SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
-		}
-
-		if (data_type_is_empty(new_a->data_type) &&
-		    BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
-		    !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
-			new_a->gen++;
-			SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
-		}
-
-		if (old_a->data_type != new_a->data_type ||
-		    (new_a->data_type == BCH_DATA_free &&
-		     alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
-			ret =   bch2_bucket_do_index(trans, ca, old, old_a, false) ?:
-				bch2_bucket_do_index(trans, ca, new.s_c, new_a, true);
-			if (ret)
-				goto err;
-		}
-
-		if (new_a->data_type == BCH_DATA_cached &&
-		    !new_a->io_time[READ])
-			new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
-
-		u64 old_lru = alloc_lru_idx_read(*old_a);
-		u64 new_lru = alloc_lru_idx_read(*new_a);
-		if (old_lru != new_lru) {
-			ret = bch2_lru_change(trans, new.k->p.inode,
-					      bucket_to_u64(new.k->p),
-					      old_lru, new_lru);
-			if (ret)
-				goto err;
-		}
-
-		new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, ca);
-		if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
-			ret = bch2_lru_change(trans,
-					BCH_LRU_FRAGMENTATION_START,
-					bucket_to_u64(new.k->p),
-					old_a->fragmentation_lru, new_a->fragmentation_lru);
-			if (ret)
-				goto err;
-		}
-
-		if (old_a->gen != new_a->gen) {
-			ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
-			if (ret)
-				goto err;
-		}
-
-		/*
-		 * need to know if we're getting called from the invalidate path or
-		 * not:
-		 */
-
-		if ((flags & BTREE_TRIGGER_bucket_invalidate) &&
-		    old_a->cached_sectors) {
-			ret = bch2_update_cached_sectors_list(trans, new.k->p.inode,
-							      -((s64) old_a->cached_sectors));
-			if (ret)
-				goto err;
-		}
-	}
-
-	if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
-		struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
-		u64 journal_seq = trans->journal_res.seq;
-		u64 bucket_journal_seq = new_a->journal_seq;
-
-		if ((flags & BTREE_TRIGGER_insert) &&
-		    data_type_is_empty(old_a->data_type) !=
-		    data_type_is_empty(new_a->data_type) &&
-		    new.k->type == KEY_TYPE_alloc_v4) {
-			struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v;
-
-			/*
-			 * If the btree updates referring to a bucket weren't flushed
-			 * before the bucket became empty again, then the we don't have
-			 * to wait on a journal flush before we can reuse the bucket:
-			 */
-			v->journal_seq = bucket_journal_seq =
-				data_type_is_empty(new_a->data_type) &&
-				(journal_seq == v->journal_seq ||
-				 bch2_journal_noflush_seq(&c->journal, v->journal_seq))
-				? 0 : journal_seq;
-		}
-
-		if (!data_type_is_empty(old_a->data_type) &&
-		    data_type_is_empty(new_a->data_type) &&
-		    bucket_journal_seq) {
-			ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
-					c->journal.flushed_seq_ondisk,
-					new.k->p.inode, new.k->p.offset,
-					bucket_journal_seq);
-			if (ret) {
-				bch2_fs_fatal_error(c,
-					"setting bucket_needs_journal_commit: %s", bch2_err_str(ret));
-				goto err;
-			}
-		}
-
-		percpu_down_read(&c->mark_lock);
-		if (new_a->gen != old_a->gen)
-			*bucket_gen(ca, new.k->p.offset) = new_a->gen;
-
-		bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false);
-		percpu_up_read(&c->mark_lock);
-
-#define eval_state(_a, expr)		({ const struct bch_alloc_v4 *a = _a; expr; })
-#define statechange(expr)		!eval_state(old_a, expr) && eval_state(new_a, expr)
-#define bucket_flushed(a)		(!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk)
-
-		if (statechange(a->data_type == BCH_DATA_free) &&
-		    bucket_flushed(new_a))
-			closure_wake_up(&c->freelist_wait);
-
-		if (statechange(a->data_type == BCH_DATA_need_discard) &&
-		    !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
-		    bucket_flushed(new_a))
-			bch2_discard_one_bucket_fast(c, new.k->p);
-
-		if (statechange(a->data_type == BCH_DATA_cached) &&
-		    !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
-		    should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
-			bch2_do_invalidates(c);
-
-		if (statechange(a->data_type == BCH_DATA_need_gc_gens))
-			bch2_gc_gens_async(c);
-	}
-
-	if ((flags & BTREE_TRIGGER_gc) &&
-	    (flags & BTREE_TRIGGER_bucket_invalidate)) {
-		struct bch_alloc_v4 new_a_convert;
-		const struct bch_alloc_v4 *new_a = bch2_alloc_to_v4(new.s_c, &new_a_convert);
-
-		percpu_down_read(&c->mark_lock);
-		struct bucket *g = gc_bucket(ca, new.k->p.offset);
-
-		bucket_lock(g);
-
-		g->gen_valid		= 1;
-		g->gen			= new_a->gen;
-		g->data_type		= new_a->data_type;
-		g->stripe		= new_a->stripe;
-		g->stripe_redundancy	= new_a->stripe_redundancy;
-		g->dirty_sectors	= new_a->dirty_sectors;
-		g->cached_sectors	= new_a->cached_sectors;
-
-		bucket_unlock(g);
-		percpu_up_read(&c->mark_lock);
-	}
-err:
-	bch2_dev_put(ca);
-	return ret;
-}
-
-/*
- * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for
- * extents style btrees, but works on non-extents btrees:
- */
-static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole)
-{
-	struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
-
-	if (bkey_err(k))
-		return k;
-
-	if (k.k->type) {
-		return k;
-	} else {
-		struct btree_iter iter2;
-		struct bpos next;
-
-		bch2_trans_copy_iter(&iter2, iter);
-
-		struct btree_path *path = btree_iter_path(iter->trans, iter);
-		if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX))
-			end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p));
-
-		end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));
-
-		/*
-		 * btree node min/max is a closed interval, upto takes a half
-		 * open interval:
-		 */
-		k = bch2_btree_iter_peek_upto(&iter2, end);
-		next = iter2.pos;
-		bch2_trans_iter_exit(iter->trans, &iter2);
-
-		BUG_ON(next.offset >= iter->pos.offset + U32_MAX);
-
-		if (bkey_err(k))
-			return k;
-
-		bkey_init(hole);
-		hole->p = iter->pos;
-
-		bch2_key_resize(hole, next.offset - iter->pos.offset);
-		return (struct bkey_s_c) { hole, NULL };
-	}
-}
-
-static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket)
-{
-	if (*ca) {
-		if (bucket->offset < (*ca)->mi.first_bucket)
-			bucket->offset = (*ca)->mi.first_bucket;
-
-		if (bucket->offset < (*ca)->mi.nbuckets)
-			return true;
-
-		bch2_dev_put(*ca);
-		*ca = NULL;
-		bucket->inode++;
-		bucket->offset = 0;
-	}
-
-	rcu_read_lock();
-	*ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
-	if (*ca) {
-		*bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket);
-		bch2_dev_get(*ca);
-	}
-	rcu_read_unlock();
-
-	return *ca != NULL;
-}
-
-static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter,
-					struct bch_dev **ca, struct bkey *hole)
-{
-	struct bch_fs *c = iter->trans->c;
-	struct bkey_s_c k;
-again:
-	k = bch2_get_key_or_hole(iter, POS_MAX, hole);
-	if (bkey_err(k))
-		return k;
-
-	*ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode);
-
-	if (!k.k->type) {
-		struct bpos hole_start = bkey_start_pos(k.k);
-
-		if (!*ca || !bucket_valid(*ca, hole_start.offset)) {
-			if (!next_bucket(c, ca, &hole_start))
-				return bkey_s_c_null;
-
-			bch2_btree_iter_set_pos(iter, hole_start);
-			goto again;
-		}
-
-		if (k.k->p.offset > (*ca)->mi.nbuckets)
-			bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset);
-	}
-
-	return k;
-}
-
-static noinline_for_stack
-int bch2_check_alloc_key(struct btree_trans *trans,
-			 struct bkey_s_c alloc_k,
-			 struct btree_iter *alloc_iter,
-			 struct btree_iter *discard_iter,
-			 struct btree_iter *freespace_iter,
-			 struct btree_iter *bucket_gens_iter)
-{
-	struct bch_fs *c = trans->c;
-	struct bch_alloc_v4 a_convert;
-	const struct bch_alloc_v4 *a;
-	unsigned discard_key_type, freespace_key_type;
-	unsigned gens_offset;
-	struct bkey_s_c k;
-	struct printbuf buf = PRINTBUF;
-	int ret = 0;
-
-	struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p);
-	if (fsck_err_on(!ca,
-			c, alloc_key_to_missing_dev_bucket,
-			"alloc key for invalid device:bucket %llu:%llu",
-			alloc_k.k->p.inode, alloc_k.k->p.offset))
-		ret = bch2_btree_delete_at(trans, alloc_iter, 0);
-	if (!ca)
-		return ret;
-
-	if (!ca->mi.freespace_initialized)
-		goto out;
-
-	a = bch2_alloc_to_v4(alloc_k, &a_convert);
-
-	discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0;
-	bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
-	k = bch2_btree_iter_peek_slot(discard_iter);
-	ret = bkey_err(k);
-	if (ret)
-		goto err;
-
-	if (fsck_err_on(k.k->type != discard_key_type,
-			c, need_discard_key_wrong,
-			"incorrect key in need_discard btree (got %s should be %s)\n"
-			"  %s",
-			bch2_bkey_types[k.k->type],
-			bch2_bkey_types[discard_key_type],
-			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
-		struct bkey_i *update =
-			bch2_trans_kmalloc(trans, sizeof(*update));
-
-		ret = PTR_ERR_OR_ZERO(update);
-		if (ret)
-			goto err;
-
-		bkey_init(&update->k);
-		update->k.type	= discard_key_type;
-		update->k.p	= discard_iter->pos;
-
-		ret = bch2_trans_update(trans, discard_iter, update, 0);
-		if (ret)
-			goto err;
-	}
-
-	freespace_key_type = a->data_type == BCH_DATA_free ? KEY_TYPE_set : 0;
-	bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
-	k = bch2_btree_iter_peek_slot(freespace_iter);
-	ret = bkey_err(k);
-	if (ret)
-		goto err;
-
-	if (fsck_err_on(k.k->type != freespace_key_type,
-			c, freespace_key_wrong,
-			"incorrect key in freespace btree (got %s should be %s)\n"
-			"  %s",
-			bch2_bkey_types[k.k->type],
-			bch2_bkey_types[freespace_key_type],
-			(printbuf_reset(&buf),
-			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
-		struct bkey_i *update =
-			bch2_trans_kmalloc(trans, sizeof(*update));
-
-		ret = PTR_ERR_OR_ZERO(update);
-		if (ret)
-			goto err;
-
-		bkey_init(&update->k);
-		update->k.type	= freespace_key_type;
-		update->k.p	= freespace_iter->pos;
-		bch2_key_resize(&update->k, 1);
-
-		ret = bch2_trans_update(trans, freespace_iter, update, 0);
-		if (ret)
-			goto err;
-	}
-
-	bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
-	k = bch2_btree_iter_peek_slot(bucket_gens_iter);
-	ret = bkey_err(k);
-	if (ret)
-		goto err;
-
-	if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
-			c, bucket_gens_key_wrong,
-			"incorrect gen in bucket_gens btree (got %u should be %u)\n"
-			"  %s",
-			alloc_gen(k, gens_offset), a->gen,
-			(printbuf_reset(&buf),
-			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
-		struct bkey_i_bucket_gens *g =
-			bch2_trans_kmalloc(trans, sizeof(*g));
-
-		ret = PTR_ERR_OR_ZERO(g);
-		if (ret)
-			goto err;
-
-		if (k.k->type == KEY_TYPE_bucket_gens) {
-			bkey_reassemble(&g->k_i, k);
-		} else {
-			bkey_bucket_gens_init(&g->k_i);
-			g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset);
-		}
-
-		g->v.gens[gens_offset] = a->gen;
-
-		ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0);
-		if (ret)
-			goto err;
-	}
-out:
-err:
-fsck_err:
-	bch2_dev_put(ca);
-	printbuf_exit(&buf);
-	return ret;
-}
-
-static noinline_for_stack
-int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
-				    struct bch_dev *ca,
-				    struct bpos start,
-				    struct bpos *end,
-				    struct btree_iter *freespace_iter)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_s_c k;
-	struct printbuf buf = PRINTBUF;
-	int ret;
-
-	if (!ca->mi.freespace_initialized)
-		return 0;
-
-	bch2_btree_iter_set_pos(freespace_iter, start);
-
-	k = bch2_btree_iter_peek_slot(freespace_iter);
-	ret = bkey_err(k);
-	if (ret)
-		goto err;
-
-	*end = bkey_min(k.k->p, *end);
-
-	if (fsck_err_on(k.k->type != KEY_TYPE_set,
-			c, freespace_hole_missing,
-			"hole in alloc btree missing in freespace btree\n"
-			"  device %llu buckets %llu-%llu",
-			freespace_iter->pos.inode,
-			freespace_iter->pos.offset,
-			end->offset)) {
-		struct bkey_i *update =
-			bch2_trans_kmalloc(trans, sizeof(*update));
-
-		ret = PTR_ERR_OR_ZERO(update);
-		if (ret)
-			goto err;
-
-		bkey_init(&update->k);
-		update->k.type	= KEY_TYPE_set;
-		update->k.p	= freespace_iter->pos;
-		bch2_key_resize(&update->k,
-				min_t(u64, U32_MAX, end->offset -
-				      freespace_iter->pos.offset));
-
-		ret = bch2_trans_update(trans, freespace_iter, update, 0);
-		if (ret)
-			goto err;
-	}
-err:
-fsck_err:
-	printbuf_exit(&buf);
-	return ret;
-}
-
-static noinline_for_stack
-int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
-				      struct bpos start,
-				      struct bpos *end,
-				      struct btree_iter *bucket_gens_iter)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_s_c k;
-	struct printbuf buf = PRINTBUF;
-	unsigned i, gens_offset, gens_end_offset;
-	int ret;
-
-	bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
-
-	k = bch2_btree_iter_peek_slot(bucket_gens_iter);
-	ret = bkey_err(k);
-	if (ret)
-		goto err;
-
-	if (bkey_cmp(alloc_gens_pos(start, &gens_offset),
-		     alloc_gens_pos(*end,  &gens_end_offset)))
-		gens_end_offset = KEY_TYPE_BUCKET_GENS_NR;
-
-	if (k.k->type == KEY_TYPE_bucket_gens) {
-		struct bkey_i_bucket_gens g;
-		bool need_update = false;
-
-		bkey_reassemble(&g.k_i, k);
-
-		for (i = gens_offset; i < gens_end_offset; i++) {
-			if (fsck_err_on(g.v.gens[i], c,
-					bucket_gens_hole_wrong,
-					"hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
-					bucket_gens_pos_to_alloc(k.k->p, i).inode,
-					bucket_gens_pos_to_alloc(k.k->p, i).offset,
-					g.v.gens[i])) {
-				g.v.gens[i] = 0;
-				need_update = true;
-			}
-		}
-
-		if (need_update) {
-			struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
-
-			ret = PTR_ERR_OR_ZERO(u);
-			if (ret)
-				goto err;
-
-			memcpy(u, &g, sizeof(g));
-
-			ret = bch2_trans_update(trans, bucket_gens_iter, u, 0);
-			if (ret)
-				goto err;
-		}
-	}
-
-	*end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0));
-err:
-fsck_err:
-	printbuf_exit(&buf);
-	return ret;
-}
-
-static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans,
-					      struct btree_iter *iter)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter alloc_iter;
-	struct bkey_s_c alloc_k;
-	struct bch_alloc_v4 a_convert;
-	const struct bch_alloc_v4 *a;
-	u64 genbits;
-	struct bpos pos;
-	enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard
-		? BCH_DATA_need_discard
-		: BCH_DATA_free;
-	struct printbuf buf = PRINTBUF;
-	int ret;
-
-	pos = iter->pos;
-	pos.offset &= ~(~0ULL << 56);
-	genbits = iter->pos.offset & (~0ULL << 56);
-
-	alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0);
-	ret = bkey_err(alloc_k);
-	if (ret)
-		return ret;
-
-	if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
-			need_discard_freespace_key_to_invalid_dev_bucket,
-			"entry in %s btree for nonexistant dev:bucket %llu:%llu",
-			bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset))
-		goto delete;
-
-	a = bch2_alloc_to_v4(alloc_k, &a_convert);
-
-	if (fsck_err_on(a->data_type != state ||
-			(state == BCH_DATA_free &&
-			 genbits != alloc_freespace_genbits(*a)), c,
-			need_discard_freespace_key_bad,
-			"%s\n  incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
-			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
-			bch2_btree_id_str(iter->btree_id),
-			iter->pos.inode,
-			iter->pos.offset,
-			a->data_type == state,
-			genbits >> 56, alloc_freespace_genbits(*a) >> 56))
-		goto delete;
-out:
-fsck_err:
-	bch2_set_btree_iter_dontneed(&alloc_iter);
-	bch2_trans_iter_exit(trans, &alloc_iter);
-	printbuf_exit(&buf);
-	return ret;
-delete:
-	ret =   bch2_btree_delete_extent_at(trans, iter,
-			iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?:
-		bch2_trans_commit(trans, NULL, NULL,
-			BCH_TRANS_COMMIT_no_enospc);
-	goto out;
-}
-
-/*
- * We've already checked that generation numbers in the bucket_gens btree are
- * valid for buckets that exist; this just checks for keys for nonexistent
- * buckets.
- */
-static noinline_for_stack
-int bch2_check_bucket_gens_key(struct btree_trans *trans,
-			       struct btree_iter *iter,
-			       struct bkey_s_c k)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_i_bucket_gens g;
-	u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
-	u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
-	u64 b;
-	bool need_update = false;
-	struct printbuf buf = PRINTBUF;
-	int ret = 0;
-
-	BUG_ON(k.k->type != KEY_TYPE_bucket_gens);
-	bkey_reassemble(&g.k_i, k);
-
-	struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode);
-	if (!ca) {
-		if (fsck_err(c, bucket_gens_to_invalid_dev,
-			     "bucket_gens key for invalid device:\n  %s",
-			     (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-			ret = bch2_btree_delete_at(trans, iter, 0);
-		goto out;
-	}
-
-	if (fsck_err_on(end <= ca->mi.first_bucket ||
-			start >= ca->mi.nbuckets, c,
-			bucket_gens_to_invalid_buckets,
-			"bucket_gens key for invalid buckets:\n  %s",
-			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-		ret = bch2_btree_delete_at(trans, iter, 0);
-		goto out;
-	}
-
-	for (b = start; b < ca->mi.first_bucket; b++)
-		if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
-				bucket_gens_nonzero_for_invalid_buckets,
-				"bucket_gens key has nonzero gen for invalid bucket")) {
-			g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
-			need_update = true;
-		}
-
-	for (b = ca->mi.nbuckets; b < end; b++)
-		if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
-				bucket_gens_nonzero_for_invalid_buckets,
-				"bucket_gens key has nonzero gen for invalid bucket")) {
-			g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
-			need_update = true;
-		}
-
-	if (need_update) {
-		struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
-
-		ret = PTR_ERR_OR_ZERO(u);
-		if (ret)
-			goto out;
-
-		memcpy(u, &g, sizeof(g));
-		ret = bch2_trans_update(trans, iter, u, 0);
-	}
-out:
-fsck_err:
-	bch2_dev_put(ca);
-	printbuf_exit(&buf);
-	return ret;
-}
-
-int bch2_check_alloc_info(struct bch_fs *c)
-{
-	struct btree_trans *trans = bch2_trans_get(c);
-	struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
-	struct bch_dev *ca = NULL;
-	struct bkey hole;
-	struct bkey_s_c k;
-	int ret = 0;
-
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
-			     BTREE_ITER_prefetch);
-	bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
-			     BTREE_ITER_prefetch);
-	bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
-			     BTREE_ITER_prefetch);
-	bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
-			     BTREE_ITER_prefetch);
-
-	while (1) {
-		struct bpos next;
-
-		bch2_trans_begin(trans);
-
-		k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole);
-		ret = bkey_err(k);
-		if (ret)
-			goto bkey_err;
-
-		if (!k.k)
-			break;
-
-		if (k.k->type) {
-			next = bpos_nosnap_successor(k.k->p);
-
-			ret = bch2_check_alloc_key(trans,
-						   k, &iter,
-						   &discard_iter,
-						   &freespace_iter,
-						   &bucket_gens_iter);
-			if (ret)
-				goto bkey_err;
-		} else {
-			next = k.k->p;
-
-			ret = bch2_check_alloc_hole_freespace(trans, ca,
-						    bkey_start_pos(k.k),
-						    &next,
-						    &freespace_iter) ?:
-				bch2_check_alloc_hole_bucket_gens(trans,
-						    bkey_start_pos(k.k),
-						    &next,
-						    &bucket_gens_iter);
-			if (ret)
-				goto bkey_err;
-		}
-
-		ret = bch2_trans_commit(trans, NULL, NULL,
-					BCH_TRANS_COMMIT_no_enospc);
-		if (ret)
-			goto bkey_err;
-
-		bch2_btree_iter_set_pos(&iter, next);
-bkey_err:
-		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			continue;
-		if (ret)
-			break;
-	}
-	bch2_trans_iter_exit(trans, &bucket_gens_iter);
-	bch2_trans_iter_exit(trans, &freespace_iter);
-	bch2_trans_iter_exit(trans, &discard_iter);
-	bch2_trans_iter_exit(trans, &iter);
-	bch2_dev_put(ca);
-	ca = NULL;
-
-	if (ret < 0)
-		goto err;
-
-	ret = for_each_btree_key(trans, iter,
-			BTREE_ID_need_discard, POS_MIN,
-			BTREE_ITER_prefetch, k,
-		bch2_check_discard_freespace_key(trans, &iter));
-	if (ret)
-		goto err;
-
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN,
-			     BTREE_ITER_prefetch);
-	while (1) {
-		bch2_trans_begin(trans);
-		k = bch2_btree_iter_peek(&iter);
-		if (!k.k)
-			break;
-
-		ret = bkey_err(k) ?:
-			bch2_check_discard_freespace_key(trans, &iter);
-		if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-			ret = 0;
-			continue;
-		}
-		if (ret) {
-			struct printbuf buf = PRINTBUF;
-			bch2_bkey_val_to_text(&buf, c, k);
-
-			bch_err(c, "while checking %s", buf.buf);
-			printbuf_exit(&buf);
-			break;
-		}
-
-		bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
-	}
-	bch2_trans_iter_exit(trans, &iter);
-	if (ret)
-		goto err;
-
-	ret = for_each_btree_key_commit(trans, iter,
-			BTREE_ID_bucket_gens, POS_MIN,
-			BTREE_ITER_prefetch, k,
-			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-		bch2_check_bucket_gens_key(trans, &iter, k));
-err:
-	bch2_trans_put(trans);
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
-				       struct btree_iter *alloc_iter)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter lru_iter;
-	struct bch_alloc_v4 a_convert;
-	const struct bch_alloc_v4 *a;
-	struct bkey_s_c alloc_k, lru_k;
-	struct printbuf buf = PRINTBUF;
-	int ret;
-
-	alloc_k = bch2_btree_iter_peek(alloc_iter);
-	if (!alloc_k.k)
-		return 0;
-
-	ret = bkey_err(alloc_k);
-	if (ret)
-		return ret;
-
-	a = bch2_alloc_to_v4(alloc_k, &a_convert);
-
-	if (a->data_type != BCH_DATA_cached)
-		return 0;
-
-	if (fsck_err_on(!a->io_time[READ], c,
-			alloc_key_cached_but_read_time_zero,
-			"cached bucket with read_time 0\n"
-			"  %s",
-		(printbuf_reset(&buf),
-		 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
-		struct bkey_i_alloc_v4 *a_mut =
-			bch2_alloc_to_v4_mut(trans, alloc_k);
-		ret = PTR_ERR_OR_ZERO(a_mut);
-		if (ret)
-			goto err;
-
-		a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
-		ret = bch2_trans_update(trans, alloc_iter,
-					&a_mut->k_i, BTREE_TRIGGER_norun);
-		if (ret)
-			goto err;
-
-		a = &a_mut->v;
-	}
-
-	lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
-			     lru_pos(alloc_k.k->p.inode,
-				     bucket_to_u64(alloc_k.k->p),
-				     a->io_time[READ]), 0);
-	ret = bkey_err(lru_k);
-	if (ret)
-		return ret;
-
-	if (fsck_err_on(lru_k.k->type != KEY_TYPE_set, c,
-			alloc_key_to_missing_lru_entry,
-			"missing lru entry\n"
-			"  %s",
-			(printbuf_reset(&buf),
-			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
-		ret = bch2_lru_set(trans,
-				   alloc_k.k->p.inode,
-				   bucket_to_u64(alloc_k.k->p),
-				   a->io_time[READ]);
-		if (ret)
-			goto err;
-	}
-err:
-fsck_err:
-	bch2_trans_iter_exit(trans, &lru_iter);
-	printbuf_exit(&buf);
-	return ret;
-}
-
-int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
-{
-	int ret = bch2_trans_run(c,
-		for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
-				POS_MIN, BTREE_ITER_prefetch, k,
-				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-			bch2_check_alloc_to_lru_ref(trans, &iter)));
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket)
-{
-	int ret;
-
-	mutex_lock(&c->discard_buckets_in_flight_lock);
-	darray_for_each(c->discard_buckets_in_flight, i)
-		if (bkey_eq(*i, bucket)) {
-			ret = -EEXIST;
-			goto out;
-		}
-
-	ret = darray_push(&c->discard_buckets_in_flight, bucket);
-out:
-	mutex_unlock(&c->discard_buckets_in_flight_lock);
-	return ret;
-}
-
-static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket)
-{
-	mutex_lock(&c->discard_buckets_in_flight_lock);
-	darray_for_each(c->discard_buckets_in_flight, i)
-		if (bkey_eq(*i, bucket)) {
-			darray_remove_item(&c->discard_buckets_in_flight, i);
-			goto found;
-		}
-	BUG();
-found:
-	mutex_unlock(&c->discard_buckets_in_flight_lock);
-}
-
-struct discard_buckets_state {
-	u64		seen;
-	u64		open;
-	u64		need_journal_commit;
-	u64		discarded;
-	struct bch_dev	*ca;
-	u64		need_journal_commit_this_dev;
-};
-
-static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca)
-{
-	if (s->ca == ca)
-		return;
-
-	if (s->ca && s->need_journal_commit_this_dev >
-	    bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets)
-		bch2_journal_flush_async(&c->journal, NULL);
-
-	if (s->ca)
-		percpu_ref_put(&s->ca->io_ref);
-	s->ca = ca;
-	s->need_journal_commit_this_dev = 0;
-}
-
-static int bch2_discard_one_bucket(struct btree_trans *trans,
-				   struct btree_iter *need_discard_iter,
-				   struct bpos *discard_pos_done,
-				   struct discard_buckets_state *s)
-{
-	struct bch_fs *c = trans->c;
-	struct bpos pos = need_discard_iter->pos;
-	struct btree_iter iter = { NULL };
-	struct bkey_s_c k;
-	struct bkey_i_alloc_v4 *a;
-	struct printbuf buf = PRINTBUF;
-	bool discard_locked = false;
-	int ret = 0;
-
-	struct bch_dev *ca = s->ca && s->ca->dev_idx == pos.inode
-		? s->ca
-		: bch2_dev_get_ioref(c, pos.inode, WRITE);
-	if (!ca) {
-		bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
-		return 0;
-	}
-
-	discard_buckets_next_dev(c, s, ca);
-
-	if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
-		s->open++;
-		goto out;
-	}
-
-	if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
-			c->journal.flushed_seq_ondisk,
-			pos.inode, pos.offset)) {
-		s->need_journal_commit++;
-		s->need_journal_commit_this_dev++;
-		goto out;
-	}
-
-	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
-			       need_discard_iter->pos,
-			       BTREE_ITER_cached);
-	ret = bkey_err(k);
-	if (ret)
-		goto out;
-
-	a = bch2_alloc_to_v4_mut(trans, k);
-	ret = PTR_ERR_OR_ZERO(a);
-	if (ret)
-		goto out;
-
-	if (bch2_bucket_sectors_total(a->v)) {
-		if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
-					       trans, "attempting to discard bucket with dirty data\n%s",
-					       (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-			ret = -EIO;
-		goto out;
-	}
-
-	if (a->v.data_type != BCH_DATA_need_discard) {
-		if (data_type_is_empty(a->v.data_type) &&
-		    BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
-			a->v.gen++;
-			SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
-			goto write;
-		}
-
-		if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
-					       trans, "bucket incorrectly set in need_discard btree\n"
-					       "%s",
-					       (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-			ret = -EIO;
-		goto out;
-	}
-
-	if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
-		if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
-					       trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s",
-					       a->v.journal_seq,
-					       c->journal.flushed_seq_ondisk,
-					       (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-			ret = -EIO;
-		goto out;
-	}
-
-	if (discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true)))
-		goto out;
-
-	discard_locked = true;
-
-	if (!bkey_eq(*discard_pos_done, iter.pos) &&
-	    ca->mi.discard && !c->opts.nochanges) {
-		/*
-		 * This works without any other locks because this is the only
-		 * thread that removes items from the need_discard tree
-		 */
-		bch2_trans_unlock_long(trans);
-		blkdev_issue_discard(ca->disk_sb.bdev,
-				     k.k->p.offset * ca->mi.bucket_size,
-				     ca->mi.bucket_size,
-				     GFP_KERNEL);
-		*discard_pos_done = iter.pos;
-
-		ret = bch2_trans_relock_notrace(trans);
-		if (ret)
-			goto out;
-	}
-
-	SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
-	alloc_data_type_set(&a->v, a->v.data_type);
-write:
-	ret =   bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
-		bch2_trans_commit(trans, NULL, NULL,
-				  BCH_WATERMARK_btree|
-				  BCH_TRANS_COMMIT_no_enospc);
-	if (ret)
-		goto out;
-
-	count_event(c, bucket_discard);
-	s->discarded++;
-out:
-	if (discard_locked)
-		discard_in_flight_remove(c, iter.pos);
-	s->seen++;
-	bch2_trans_iter_exit(trans, &iter);
-	printbuf_exit(&buf);
-	return ret;
-}
-
-static void bch2_do_discards_work(struct work_struct *work)
-{
-	struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
-	struct discard_buckets_state s = {};
-	struct bpos discard_pos_done = POS_MAX;
-	int ret;
-
-	/*
-	 * We're doing the commit in bch2_discard_one_bucket instead of using
-	 * for_each_btree_key_commit() so that we can increment counters after
-	 * successful commit:
-	 */
-	ret = bch2_trans_run(c,
-		for_each_btree_key(trans, iter,
-				   BTREE_ID_need_discard, POS_MIN, 0, k,
-			bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s)));
-
-	discard_buckets_next_dev(c, &s, NULL);
-
-	trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
-			      bch2_err_str(ret));
-
-	bch2_write_ref_put(c, BCH_WRITE_REF_discard);
-}
-
-void bch2_do_discards(struct bch_fs *c)
-{
-	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) &&
-	    !queue_work(c->write_ref_wq, &c->discard_work))
-		bch2_write_ref_put(c, BCH_WRITE_REF_discard);
-}
-
-static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket)
-{
-	struct btree_iter iter;
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_intent);
-	struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
-	int ret = bkey_err(k);
-	if (ret)
-		goto err;
-
-	struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k);
-	ret = PTR_ERR_OR_ZERO(a);
-	if (ret)
-		goto err;
-
-	BUG_ON(a->v.dirty_sectors);
-	SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
-	alloc_data_type_set(&a->v, a->v.data_type);
-
-	ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
-err:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-static void bch2_do_discards_fast_work(struct work_struct *work)
-{
-	struct bch_fs *c = container_of(work, struct bch_fs, discard_fast_work);
-
-	while (1) {
-		bool got_bucket = false;
-		struct bpos bucket;
-		struct bch_dev *ca;
-
-		mutex_lock(&c->discard_buckets_in_flight_lock);
-		darray_for_each(c->discard_buckets_in_flight, i) {
-			if (i->snapshot)
-				continue;
-
-			ca = bch2_dev_get_ioref(c, i->inode, WRITE);
-			if (!ca) {
-				darray_remove_item(&c->discard_buckets_in_flight, i);
-				continue;
-			}
-
-			got_bucket = true;
-			bucket = *i;
-			i->snapshot = true;
-			break;
-		}
-		mutex_unlock(&c->discard_buckets_in_flight_lock);
-
-		if (!got_bucket)
-			break;
-
-		if (ca->mi.discard && !c->opts.nochanges)
-			blkdev_issue_discard(ca->disk_sb.bdev,
-					     bucket.offset * ca->mi.bucket_size,
-					     ca->mi.bucket_size,
-					     GFP_KERNEL);
-
-		int ret = bch2_trans_do(c, NULL, NULL,
-					BCH_WATERMARK_btree|
-					BCH_TRANS_COMMIT_no_enospc,
-					bch2_clear_bucket_needs_discard(trans, bucket));
-		bch_err_fn(c, ret);
-
-		percpu_ref_put(&ca->io_ref);
-		discard_in_flight_remove(c, bucket);
-
-		if (ret)
-			break;
-	}
-
-	bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
-}
-
-static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket)
-{
-	rcu_read_lock();
-	struct bch_dev *ca = bch2_dev_rcu(c, bucket.inode);
-	bool dead = !ca || percpu_ref_is_dying(&ca->io_ref);
-	rcu_read_unlock();
-
-	if (!dead &&
-	    !discard_in_flight_add(c, bucket) &&
-	    bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) &&
-	    !queue_work(c->write_ref_wq, &c->discard_fast_work))
-		bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
-}
-
-static int invalidate_one_bucket(struct btree_trans *trans,
-				 struct btree_iter *lru_iter,
-				 struct bkey_s_c lru_k,
-				 s64 *nr_to_invalidate)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_i_alloc_v4 *a = NULL;
-	struct printbuf buf = PRINTBUF;
-	struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
-	unsigned cached_sectors;
-	int ret = 0;
-
-	if (*nr_to_invalidate <= 0)
-		return 1;
-
-	if (!bch2_dev_bucket_exists(c, bucket)) {
-		prt_str(&buf, "lru entry points to invalid bucket");
-		goto err;
-	}
-
-	if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
-		return 0;
-
-	a = bch2_trans_start_alloc_update(trans, bucket);
-	ret = PTR_ERR_OR_ZERO(a);
-	if (ret)
-		goto out;
-
-	/* We expect harmless races here due to the btree write buffer: */
-	if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v))
-		goto out;
-
-	BUG_ON(a->v.data_type != BCH_DATA_cached);
-	BUG_ON(a->v.dirty_sectors);
-
-	if (!a->v.cached_sectors)
-		bch_err(c, "invalidating empty bucket, confused");
-
-	cached_sectors = a->v.cached_sectors;
-
-	SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
-	a->v.gen++;
-	a->v.data_type		= 0;
-	a->v.dirty_sectors	= 0;
-	a->v.cached_sectors	= 0;
-	a->v.io_time[READ]	= atomic64_read(&c->io_clock[READ].now);
-	a->v.io_time[WRITE]	= atomic64_read(&c->io_clock[WRITE].now);
-
-	ret = bch2_trans_commit(trans, NULL, NULL,
-				BCH_WATERMARK_btree|
-				BCH_TRANS_COMMIT_no_enospc);
-	if (ret)
-		goto out;
-
-	trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
-	--*nr_to_invalidate;
-out:
-	printbuf_exit(&buf);
-	return ret;
-err:
-	prt_str(&buf, "\n  lru key: ");
-	bch2_bkey_val_to_text(&buf, c, lru_k);
-
-	prt_str(&buf, "\n  lru entry: ");
-	bch2_lru_pos_to_text(&buf, lru_iter->pos);
-
-	prt_str(&buf, "\n  alloc key: ");
-	if (!a)
-		bch2_bpos_to_text(&buf, bucket);
-	else
-		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));
-
-	bch_err(c, "%s", buf.buf);
-	if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) {
-		bch2_inconsistent_error(c);
-		ret = -EINVAL;
-	}
-
-	goto out;
-}
-
-static void bch2_do_invalidates_work(struct work_struct *work)
-{
-	struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
-	struct btree_trans *trans = bch2_trans_get(c);
-	int ret = 0;
-
-	ret = bch2_btree_write_buffer_tryflush(trans);
-	if (ret)
-		goto err;
-
-	for_each_member_device(c, ca) {
-		s64 nr_to_invalidate =
-			should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
-
-		ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,
-				lru_pos(ca->dev_idx, 0, 0),
-				lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
-				BTREE_ITER_intent, k,
-			invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate));
-
-		if (ret < 0) {
-			bch2_dev_put(ca);
-			break;
-		}
-	}
-err:
-	bch2_trans_put(trans);
-	bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
-}
-
-void bch2_do_invalidates(struct bch_fs *c)
-{
-	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) &&
-	    !queue_work(c->write_ref_wq, &c->invalidate_work))
-		bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
-}
-
-int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
-			    u64 bucket_start, u64 bucket_end)
-{
-	struct btree_trans *trans = bch2_trans_get(c);
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	struct bkey hole;
-	struct bpos end = POS(ca->dev_idx, bucket_end);
-	struct bch_member *m;
-	unsigned long last_updated = jiffies;
-	int ret;
-
-	BUG_ON(bucket_start > bucket_end);
-	BUG_ON(bucket_end > ca->mi.nbuckets);
-
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
-		POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)),
-		BTREE_ITER_prefetch);
-	/*
-	 * Scan the alloc btree for every bucket on @ca, and add buckets to the
-	 * freespace/need_discard/need_gc_gens btrees as needed:
-	 */
-	while (1) {
-		if (last_updated + HZ * 10 < jiffies) {
-			bch_info(ca, "%s: currently at %llu/%llu",
-				 __func__, iter.pos.offset, ca->mi.nbuckets);
-			last_updated = jiffies;
-		}
-
-		bch2_trans_begin(trans);
-
-		if (bkey_ge(iter.pos, end)) {
-			ret = 0;
-			break;
-		}
-
-		k = bch2_get_key_or_hole(&iter, end, &hole);
-		ret = bkey_err(k);
-		if (ret)
-			goto bkey_err;
-
-		if (k.k->type) {
-			/*
-			 * We process live keys in the alloc btree one at a
-			 * time:
-			 */
-			struct bch_alloc_v4 a_convert;
-			const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
-
-			ret =   bch2_bucket_do_index(trans, ca, k, a, true) ?:
-				bch2_trans_commit(trans, NULL, NULL,
-						  BCH_TRANS_COMMIT_no_enospc);
-			if (ret)
-				goto bkey_err;
-
-			bch2_btree_iter_advance(&iter);
-		} else {
-			struct bkey_i *freespace;
-
-			freespace = bch2_trans_kmalloc(trans, sizeof(*freespace));
-			ret = PTR_ERR_OR_ZERO(freespace);
-			if (ret)
-				goto bkey_err;
-
-			bkey_init(&freespace->k);
-			freespace->k.type	= KEY_TYPE_set;
-			freespace->k.p		= k.k->p;
-			freespace->k.size	= k.k->size;
-
-			ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
-				bch2_trans_commit(trans, NULL, NULL,
-						  BCH_TRANS_COMMIT_no_enospc);
-			if (ret)
-				goto bkey_err;
-
-			bch2_btree_iter_set_pos(&iter, k.k->p);
-		}
-bkey_err:
-		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			continue;
-		if (ret)
-			break;
-	}
-
-	bch2_trans_iter_exit(trans, &iter);
-	bch2_trans_put(trans);
-
-	if (ret < 0) {
-		bch_err_msg(ca, ret, "initializing free space");
-		return ret;
-	}
-
-	mutex_lock(&c->sb_lock);
-	m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
-	SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
-	mutex_unlock(&c->sb_lock);
-
-	return 0;
-}
-
-int bch2_fs_freespace_init(struct bch_fs *c)
-{
-	int ret = 0;
-	bool doing_init = false;
-
-	/*
-	 * We can crash during the device add path, so we need to check this on
-	 * every mount:
-	 */
-
-	for_each_member_device(c, ca) {
-		if (ca->mi.freespace_initialized)
-			continue;
-
-		if (!doing_init) {
-			bch_info(c, "initializing freespace");
-			doing_init = true;
-		}
-
-		ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
-		if (ret) {
-			bch2_dev_put(ca);
-			bch_err_fn(c, ret);
-			return ret;
-		}
-	}
-
-	if (doing_init) {
-		mutex_lock(&c->sb_lock);
-		bch2_write_super(c);
-		mutex_unlock(&c->sb_lock);
-		bch_verbose(c, "done initializing freespace");
-	}
-
-	return 0;
-}
-
-/* Bucket IO clocks: */
-
-int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
-			      size_t bucket_nr, int rw)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter iter;
-	struct bkey_i_alloc_v4 *a;
-	u64 now;
-	int ret = 0;
-
-	if (bch2_trans_relock(trans))
-		bch2_trans_begin(trans);
-
-	a = bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr));
-	ret = PTR_ERR_OR_ZERO(a);
-	if (ret)
-		return ret;
-
-	now = atomic64_read(&c->io_clock[rw].now);
-	if (a->v.io_time[rw] == now)
-		goto out;
-
-	a->v.io_time[rw] = now;
-
-	ret   = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
-		bch2_trans_commit(trans, NULL, NULL, 0);
-out:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-/* Startup/shutdown (ro/rw): */
-
-void bch2_recalc_capacity(struct bch_fs *c)
-{
-	u64 capacity = 0, reserved_sectors = 0, gc_reserve;
-	unsigned bucket_size_max = 0;
-	unsigned long ra_pages = 0;
-
-	lockdep_assert_held(&c->state_lock);
-
-	for_each_online_member(c, ca) {
-		struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;
-
-		ra_pages += bdi->ra_pages;
-	}
-
-	bch2_set_ra_pages(c, ra_pages);
-
-	for_each_rw_member(c, ca) {
-		u64 dev_reserve = 0;
-
-		/*
-		 * We need to reserve buckets (from the number
-		 * of currently available buckets) against
-		 * foreground writes so that mainly copygc can
-		 * make forward progress.
-		 *
-		 * We need enough to refill the various reserves
-		 * from scratch - copygc will use its entire
-		 * reserve all at once, then run against when
-		 * its reserve is refilled (from the formerly
-		 * available buckets).
-		 *
-		 * This reserve is just used when considering if
-		 * allocations for foreground writes must wait -
-		 * not -ENOSPC calculations.
-		 */
-
-		dev_reserve += ca->nr_btree_reserve * 2;
-		dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
-
-		dev_reserve += 1;	/* btree write point */
-		dev_reserve += 1;	/* copygc write point */
-		dev_reserve += 1;	/* rebalance write point */
-
-		dev_reserve *= ca->mi.bucket_size;
-
-		capacity += bucket_to_sector(ca, ca->mi.nbuckets -
-					     ca->mi.first_bucket);
-
-		reserved_sectors += dev_reserve * 2;
-
-		bucket_size_max = max_t(unsigned, bucket_size_max,
-					ca->mi.bucket_size);
-	}
-
-	gc_reserve = c->opts.gc_reserve_bytes
-		? c->opts.gc_reserve_bytes >> 9
-		: div64_u64(capacity * c->opts.gc_reserve_percent, 100);
-
-	reserved_sectors = max(gc_reserve, reserved_sectors);
-
-	reserved_sectors = min(reserved_sectors, capacity);
-
-	c->capacity = capacity - reserved_sectors;
-
-	c->bucket_size_max = bucket_size_max;
-
-	/* Wake up case someone was waiting for buckets */
-	closure_wake_up(&c->freelist_wait);
-}
-
-u64 bch2_min_rw_member_capacity(struct bch_fs *c)
-{
-	u64 ret = U64_MAX;
-
-	for_each_rw_member(c, ca)
-		ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
-	return ret;
-}
-
-static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
-{
-	struct open_bucket *ob;
-	bool ret = false;
-
-	for (ob = c->open_buckets;
-	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
-	     ob++) {
-		spin_lock(&ob->lock);
-		if (ob->valid && !ob->on_partial_list &&
-		    ob->dev == ca->dev_idx)
-			ret = true;
-		spin_unlock(&ob->lock);
-	}
-
-	return ret;
-}
-
-/* device goes ro: */
-void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
-{
-	unsigned i;
-
-	/* First, remove device from allocation groups: */
-
-	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
-		clear_bit(ca->dev_idx, c->rw_devs[i].d);
-
-	/*
-	 * Capacity is calculated based off of devices in allocation groups:
-	 */
-	bch2_recalc_capacity(c);
-
-	bch2_open_buckets_stop(c, ca, false);
-
-	/*
-	 * Wake up threads that were blocked on allocation, so they can notice
-	 * the device can no longer be removed and the capacity has changed:
-	 */
-	closure_wake_up(&c->freelist_wait);
-
-	/*
-	 * journal_res_get() can block waiting for free space in the journal -
-	 * it needs to notice there may not be devices to allocate from anymore:
-	 */
-	wake_up(&c->journal.wait);
-
-	/* Now wait for any in flight writes: */
-
-	closure_wait_event(&c->open_buckets_wait,
-			   !bch2_dev_has_open_write_point(c, ca));
-}
-
-/* device goes rw: */
-void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
-{
-	unsigned i;
-
-	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
-		if (ca->mi.data_allowed & (1 << i))
-			set_bit(ca->dev_idx, c->rw_devs[i].d);
-}
-
-void bch2_fs_allocator_background_exit(struct bch_fs *c)
-{
-	darray_exit(&c->discard_buckets_in_flight);
-}
-
-void bch2_fs_allocator_background_init(struct bch_fs *c)
-{
-	spin_lock_init(&c->freelist_lock);
-	mutex_init(&c->discard_buckets_in_flight_lock);
-	INIT_WORK(&c->discard_work, bch2_do_discards_work);
-	INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work);
-	INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
-}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
deleted file mode 100644
index ae31a94be6f9..000000000000
--- a/fs/bcachefs/alloc_background.h
+++ /dev/null
@@ -1,312 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ALLOC_BACKGROUND_H
-#define _BCACHEFS_ALLOC_BACKGROUND_H
-
-#include "bcachefs.h"
-#include "alloc_types.h"
-#include "buckets.h"
-#include "debug.h"
-#include "super.h"
-
-enum bch_validate_flags;
-
-/* How out of date a pointer gen is allowed to be: */
-#define BUCKET_GC_GEN_MAX	96U
-
-static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos)
-{
-	rcu_read_lock();
-	struct bch_dev *ca = bch2_dev_rcu(c, pos.inode);
-	bool ret = ca && bucket_valid(ca, pos.offset);
-	rcu_read_unlock();
-	return ret;
-}
-
-static inline u64 bucket_to_u64(struct bpos bucket)
-{
-	return (bucket.inode << 48) | bucket.offset;
-}
-
-static inline struct bpos u64_to_bucket(u64 bucket)
-{
-	return POS(bucket >> 48, bucket & ~(~0ULL << 48));
-}
-
-static inline u8 alloc_gc_gen(struct bch_alloc_v4 a)
-{
-	return a.gen - a.oldest_gen;
-}
-
-static inline void alloc_to_bucket(struct bucket *dst, struct bch_alloc_v4 src)
-{
-	dst->gen		= src.gen;
-	dst->data_type		= src.data_type;
-	dst->dirty_sectors	= src.dirty_sectors;
-	dst->cached_sectors	= src.cached_sectors;
-	dst->stripe		= src.stripe;
-}
-
-static inline void __bucket_m_to_alloc(struct bch_alloc_v4 *dst, struct bucket src)
-{
-	dst->gen		= src.gen;
-	dst->data_type		= src.data_type;
-	dst->dirty_sectors	= src.dirty_sectors;
-	dst->cached_sectors	= src.cached_sectors;
-	dst->stripe		= src.stripe;
-}
-
-static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b)
-{
-	struct bch_alloc_v4 ret = {};
-	__bucket_m_to_alloc(&ret, b);
-	return ret;
-}
-
-static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type)
-{
-	switch (data_type) {
-	case BCH_DATA_cached:
-	case BCH_DATA_stripe:
-		return BCH_DATA_user;
-	default:
-		return data_type;
-	}
-}
-
-static inline bool bucket_data_type_mismatch(enum bch_data_type bucket,
-					     enum bch_data_type ptr)
-{
-	return !data_type_is_empty(bucket) &&
-		bucket_data_type(bucket) != bucket_data_type(ptr);
-}
-
-static inline unsigned bch2_bucket_sectors_total(struct bch_alloc_v4 a)
-{
-	return a.dirty_sectors + a.cached_sectors;
-}
-
-static inline unsigned bch2_bucket_sectors_dirty(struct bch_alloc_v4 a)
-{
-	return a.dirty_sectors;
-}
-
-static inline unsigned bch2_bucket_sectors_fragmented(struct bch_dev *ca,
-						 struct bch_alloc_v4 a)
-{
-	int d = bch2_bucket_sectors_dirty(a);
-
-	return d ? max(0, ca->mi.bucket_size - d) : 0;
-}
-
-static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
-						 enum bch_data_type data_type)
-{
-	if (a.stripe)
-		return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe;
-	if (a.dirty_sectors)
-		return data_type;
-	if (a.cached_sectors)
-		return BCH_DATA_cached;
-	if (BCH_ALLOC_V4_NEED_DISCARD(&a))
-		return BCH_DATA_need_discard;
-	if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX)
-		return BCH_DATA_need_gc_gens;
-	return BCH_DATA_free;
-}
-
-static inline void alloc_data_type_set(struct bch_alloc_v4 *a, enum bch_data_type data_type)
-{
-	a->data_type = alloc_data_type(*a, data_type);
-}
-
-static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a)
-{
-	return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0;
-}
-
-#define DATA_TYPES_MOVABLE		\
-	((1U << BCH_DATA_btree)|	\
-	 (1U << BCH_DATA_user)|		\
-	 (1U << BCH_DATA_stripe))
-
-static inline bool data_type_movable(enum bch_data_type type)
-{
-	return (1U << type) & DATA_TYPES_MOVABLE;
-}
-
-static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
-					      struct bch_dev *ca)
-{
-	if (!data_type_movable(a.data_type) ||
-	    !bch2_bucket_sectors_fragmented(ca, a))
-		return 0;
-
-	u64 d = bch2_bucket_sectors_dirty(a);
-	return div_u64(d * (1ULL << 31), ca->mi.bucket_size);
-}
-
-static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a)
-{
-	return ((u64) alloc_gc_gen(a) >> 4) << 56;
-}
-
-static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a)
-{
-	pos.offset |= alloc_freespace_genbits(a);
-	return pos;
-}
-
-static inline unsigned alloc_v4_u64s_noerror(const struct bch_alloc_v4 *a)
-{
-	return (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
-			BCH_ALLOC_V4_U64s_V0) +
-		BCH_ALLOC_V4_NR_BACKPOINTERS(a) *
-		(sizeof(struct bch_backpointer) / sizeof(u64));
-}
-
-static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a)
-{
-	unsigned ret = alloc_v4_u64s_noerror(a);
-	BUG_ON(ret > U8_MAX - BKEY_U64s);
-	return ret;
-}
-
-static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a)
-{
-	set_bkey_val_u64s(&a->k, alloc_v4_u64s(&a->v));
-}
-
-struct bkey_i_alloc_v4 *
-bch2_trans_start_alloc_update_noupdate(struct btree_trans *, struct btree_iter *, struct bpos);
-struct bkey_i_alloc_v4 *
-bch2_trans_start_alloc_update(struct btree_trans *, struct bpos);
-
-void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *);
-
-static inline const struct bch_alloc_v4 *bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *convert)
-{
-	const struct bch_alloc_v4 *ret;
-
-	if (unlikely(k.k->type != KEY_TYPE_alloc_v4))
-		goto slowpath;
-
-	ret = bkey_s_c_to_alloc_v4(k).v;
-	if (BCH_ALLOC_V4_BACKPOINTERS_START(ret) != BCH_ALLOC_V4_U64s)
-		goto slowpath;
-
-	return ret;
-slowpath:
-	__bch2_alloc_to_v4(k, convert);
-	return convert;
-}
-
-struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c);
-
-int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
-
-int bch2_alloc_v1_invalid(struct bch_fs *, struct bkey_s_c,
-			  enum bch_validate_flags, struct printbuf *);
-int bch2_alloc_v2_invalid(struct bch_fs *, struct bkey_s_c,
-			  enum bch_validate_flags, struct printbuf *);
-int bch2_alloc_v3_invalid(struct bch_fs *, struct bkey_s_c,
-			  enum bch_validate_flags, struct printbuf *);
-int bch2_alloc_v4_invalid(struct bch_fs *, struct bkey_s_c,
-			  enum bch_validate_flags, struct printbuf *);
-void bch2_alloc_v4_swab(struct bkey_s);
-void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_alloc ((struct bkey_ops) {	\
-	.key_invalid	= bch2_alloc_v1_invalid,	\
-	.val_to_text	= bch2_alloc_to_text,		\
-	.trigger	= bch2_trigger_alloc,		\
-	.min_val_size	= 8,				\
-})
-
-#define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) {	\
-	.key_invalid	= bch2_alloc_v2_invalid,	\
-	.val_to_text	= bch2_alloc_to_text,		\
-	.trigger	= bch2_trigger_alloc,		\
-	.min_val_size	= 8,				\
-})
-
-#define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) {	\
-	.key_invalid	= bch2_alloc_v3_invalid,	\
-	.val_to_text	= bch2_alloc_to_text,		\
-	.trigger	= bch2_trigger_alloc,		\
-	.min_val_size	= 16,				\
-})
-
-#define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) {	\
-	.key_invalid	= bch2_alloc_v4_invalid,	\
-	.val_to_text	= bch2_alloc_to_text,		\
-	.swab		= bch2_alloc_v4_swab,		\
-	.trigger	= bch2_trigger_alloc,		\
-	.min_val_size	= 48,				\
-})
-
-int bch2_bucket_gens_invalid(struct bch_fs *, struct bkey_s_c,
-			     enum bch_validate_flags, struct printbuf *);
-void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_bucket_gens ((struct bkey_ops) {	\
-	.key_invalid	= bch2_bucket_gens_invalid,	\
-	.val_to_text	= bch2_bucket_gens_to_text,	\
-})
-
-int bch2_bucket_gens_init(struct bch_fs *);
-
-static inline bool bkey_is_alloc(const struct bkey *k)
-{
-	return  k->type == KEY_TYPE_alloc ||
-		k->type == KEY_TYPE_alloc_v2 ||
-		k->type == KEY_TYPE_alloc_v3;
-}
-
-int bch2_alloc_read(struct bch_fs *);
-
-int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned,
-		       struct bkey_s_c, struct bkey_s,
-		       enum btree_iter_update_trigger_flags);
-int bch2_check_alloc_info(struct bch_fs *);
-int bch2_check_alloc_to_lru_refs(struct bch_fs *);
-void bch2_do_discards(struct bch_fs *);
-
-static inline u64 should_invalidate_buckets(struct bch_dev *ca,
-					    struct bch_dev_usage u)
-{
-	u64 want_free = ca->mi.nbuckets >> 7;
-	u64 free = max_t(s64, 0,
-			   u.d[BCH_DATA_free].buckets
-			 + u.d[BCH_DATA_need_discard].buckets
-			 - bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe));
-
-	return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets);
-}
-
-void bch2_do_invalidates(struct bch_fs *);
-
-static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a)
-{
-	return (void *) ((u64 *) &a->v +
-			 (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
-			  BCH_ALLOC_V4_U64s_V0));
-}
-
-static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a)
-{
-	return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a));
-}
-
-int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64);
-int bch2_fs_freespace_init(struct bch_fs *);
-
-void bch2_recalc_capacity(struct bch_fs *);
-u64 bch2_min_rw_member_capacity(struct bch_fs *);
-
-void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
-void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
-
-void bch2_fs_allocator_background_exit(struct bch_fs *);
-void bch2_fs_allocator_background_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h
deleted file mode 100644
index b4ec20be93b8..000000000000
--- a/fs/bcachefs/alloc_background_format.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
-#define _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
-
-struct bch_alloc {
-	struct bch_val		v;
-	__u8			fields;
-	__u8			gen;
-	__u8			data[];
-} __packed __aligned(8);
-
-#define BCH_ALLOC_FIELDS_V1()			\
-	x(read_time,		16)		\
-	x(write_time,		16)		\
-	x(data_type,		8)		\
-	x(dirty_sectors,	16)		\
-	x(cached_sectors,	16)		\
-	x(oldest_gen,		8)		\
-	x(stripe,		32)		\
-	x(stripe_redundancy,	8)
-
-enum {
-#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
-	BCH_ALLOC_FIELDS_V1()
-#undef x
-};
-
-struct bch_alloc_v2 {
-	struct bch_val		v;
-	__u8			nr_fields;
-	__u8			gen;
-	__u8			oldest_gen;
-	__u8			data_type;
-	__u8			data[];
-} __packed __aligned(8);
-
-#define BCH_ALLOC_FIELDS_V2()			\
-	x(read_time,		64)		\
-	x(write_time,		64)		\
-	x(dirty_sectors,	32)		\
-	x(cached_sectors,	32)		\
-	x(stripe,		32)		\
-	x(stripe_redundancy,	8)
-
-struct bch_alloc_v3 {
-	struct bch_val		v;
-	__le64			journal_seq;
-	__le32			flags;
-	__u8			nr_fields;
-	__u8			gen;
-	__u8			oldest_gen;
-	__u8			data_type;
-	__u8			data[];
-} __packed __aligned(8);
-
-LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags,  0,  1)
-LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags,  1,  2)
-
-struct bch_alloc_v4 {
-	struct bch_val		v;
-	__u64			journal_seq;
-	__u32			flags;
-	__u8			gen;
-	__u8			oldest_gen;
-	__u8			data_type;
-	__u8			stripe_redundancy;
-	__u32			dirty_sectors;
-	__u32			cached_sectors;
-	__u64			io_time[2];
-	__u32			stripe;
-	__u32			nr_external_backpointers;
-	__u64			fragmentation_lru;
-} __packed __aligned(8);
-
-#define BCH_ALLOC_V4_U64s_V0	6
-#define BCH_ALLOC_V4_U64s	(sizeof(struct bch_alloc_v4) / sizeof(__u64))
-
-BITMASK(BCH_ALLOC_V4_NEED_DISCARD,	struct bch_alloc_v4, flags,  0,  1)
-BITMASK(BCH_ALLOC_V4_NEED_INC_GEN,	struct bch_alloc_v4, flags,  1,  2)
-BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags,  2,  8)
-BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS,	struct bch_alloc_v4, flags,  8,  14)
-
-#define KEY_TYPE_BUCKET_GENS_BITS	8
-#define KEY_TYPE_BUCKET_GENS_NR		(1U << KEY_TYPE_BUCKET_GENS_BITS)
-#define KEY_TYPE_BUCKET_GENS_MASK	(KEY_TYPE_BUCKET_GENS_NR - 1)
-
-struct bch_bucket_gens {
-	struct bch_val		v;
-	u8			gens[KEY_TYPE_BUCKET_GENS_NR];
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
deleted file mode 100644
index 927a5f300b30..000000000000
--- a/fs/bcachefs/alloc_foreground.c
+++ /dev/null
@@ -1,1796 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright 2012 Google, Inc.
- *
- * Foreground allocator code: allocate buckets from freelist, and allocate in
- * sector granularity from writepoints.
- *
- * bch2_bucket_alloc() allocates a single bucket from a specific device.
- *
- * bch2_bucket_alloc_set() allocates one or more buckets from different devices
- * in a given filesystem.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "btree_gc.h"
-#include "buckets.h"
-#include "buckets_waiting_for_journal.h"
-#include "clock.h"
-#include "debug.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "error.h"
-#include "io_write.h"
-#include "journal.h"
-#include "movinggc.h"
-#include "nocow_locking.h"
-#include "trace.h"
-
-#include <linux/math64.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
-
-static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans,
-					   struct mutex *lock)
-{
-	if (!mutex_trylock(lock)) {
-		bch2_trans_unlock(trans);
-		mutex_lock(lock);
-	}
-}
-
-const char * const bch2_watermarks[] = {
-#define x(t) #t,
-	BCH_WATERMARKS()
-#undef x
-	NULL
-};
-
-/*
- * Open buckets represent a bucket that's currently being allocated from.  They
- * serve two purposes:
- *
- *  - They track buckets that have been partially allocated, allowing for
- *    sub-bucket sized allocations - they're used by the sector allocator below
- *
- *  - They provide a reference to the buckets they own that mark and sweep GC
- *    can find, until the new allocation has a pointer to it inserted into the
- *    btree
- *
- * When allocating some space with the sector allocator, the allocation comes
- * with a reference to an open bucket - the caller is required to put that
- * reference _after_ doing the index update that makes its allocation reachable.
- */
-
-void bch2_reset_alloc_cursors(struct bch_fs *c)
-{
-	rcu_read_lock();
-	for_each_member_device_rcu(c, ca, NULL)
-		memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor));
-	rcu_read_unlock();
-}
-
-static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
-{
-	open_bucket_idx_t idx = ob - c->open_buckets;
-	open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
-
-	ob->hash = *slot;
-	*slot = idx;
-}
-
-static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
-{
-	open_bucket_idx_t idx = ob - c->open_buckets;
-	open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
-
-	while (*slot != idx) {
-		BUG_ON(!*slot);
-		slot = &c->open_buckets[*slot].hash;
-	}
-
-	*slot = ob->hash;
-	ob->hash = 0;
-}
-
-void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
-{
-	struct bch_dev *ca = ob_dev(c, ob);
-
-	if (ob->ec) {
-		ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
-		return;
-	}
-
-	percpu_down_read(&c->mark_lock);
-	spin_lock(&ob->lock);
-
-	ob->valid = false;
-	ob->data_type = 0;
-
-	spin_unlock(&ob->lock);
-	percpu_up_read(&c->mark_lock);
-
-	spin_lock(&c->freelist_lock);
-	bch2_open_bucket_hash_remove(c, ob);
-
-	ob->freelist = c->open_buckets_freelist;
-	c->open_buckets_freelist = ob - c->open_buckets;
-
-	c->open_buckets_nr_free++;
-	ca->nr_open_buckets--;
-	spin_unlock(&c->freelist_lock);
-
-	closure_wake_up(&c->open_buckets_wait);
-}
-
-void bch2_open_bucket_write_error(struct bch_fs *c,
-				  struct open_buckets *obs,
-				  unsigned dev)
-{
-	struct open_bucket *ob;
-	unsigned i;
-
-	open_bucket_for_each(c, obs, ob, i)
-		if (ob->dev == dev && ob->ec)
-			bch2_ec_bucket_cancel(c, ob);
-}
-
-static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
-{
-	struct open_bucket *ob;
-
-	BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);
-
-	ob = c->open_buckets + c->open_buckets_freelist;
-	c->open_buckets_freelist = ob->freelist;
-	atomic_set(&ob->pin, 1);
-	ob->data_type = 0;
-
-	c->open_buckets_nr_free--;
-	return ob;
-}
-
-static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
-{
-	BUG_ON(c->open_buckets_partial_nr >=
-	       ARRAY_SIZE(c->open_buckets_partial));
-
-	spin_lock(&c->freelist_lock);
-	ob->on_partial_list = true;
-	c->open_buckets_partial[c->open_buckets_partial_nr++] =
-		ob - c->open_buckets;
-	spin_unlock(&c->freelist_lock);
-
-	closure_wake_up(&c->open_buckets_wait);
-	closure_wake_up(&c->freelist_wait);
-}
-
-/* _only_ for allocating the journal on a new device: */
-long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
-{
-	while (ca->new_fs_bucket_idx < ca->mi.nbuckets) {
-		u64 b = ca->new_fs_bucket_idx++;
-
-		if (!is_superblock_bucket(ca, b) &&
-		    (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse)))
-			return b;
-	}
-
-	return -1;
-}
-
-static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
-{
-	switch (watermark) {
-	case BCH_WATERMARK_interior_updates:
-		return 0;
-	case BCH_WATERMARK_reclaim:
-		return OPEN_BUCKETS_COUNT / 6;
-	case BCH_WATERMARK_btree:
-	case BCH_WATERMARK_btree_copygc:
-		return OPEN_BUCKETS_COUNT / 4;
-	case BCH_WATERMARK_copygc:
-		return OPEN_BUCKETS_COUNT / 3;
-	default:
-		return OPEN_BUCKETS_COUNT / 2;
-	}
-}
-
-static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
-					      u64 bucket,
-					      enum bch_watermark watermark,
-					      const struct bch_alloc_v4 *a,
-					      struct bucket_alloc_state *s,
-					      struct closure *cl)
-{
-	struct open_bucket *ob;
-
-	if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
-		s->skipped_nouse++;
-		return NULL;
-	}
-
-	if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
-		s->skipped_open++;
-		return NULL;
-	}
-
-	if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
-			c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
-		s->skipped_need_journal_commit++;
-		return NULL;
-	}
-
-	if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) {
-		s->skipped_nocow++;
-		return NULL;
-	}
-
-	spin_lock(&c->freelist_lock);
-
-	if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) {
-		if (cl)
-			closure_wait(&c->open_buckets_wait, cl);
-
-		track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
-		spin_unlock(&c->freelist_lock);
-		return ERR_PTR(-BCH_ERR_open_buckets_empty);
-	}
-
-	/* Recheck under lock: */
-	if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
-		spin_unlock(&c->freelist_lock);
-		s->skipped_open++;
-		return NULL;
-	}
-
-	ob = bch2_open_bucket_alloc(c);
-
-	spin_lock(&ob->lock);
-
-	ob->valid	= true;
-	ob->sectors_free = ca->mi.bucket_size;
-	ob->dev		= ca->dev_idx;
-	ob->gen		= a->gen;
-	ob->bucket	= bucket;
-	spin_unlock(&ob->lock);
-
-	ca->nr_open_buckets++;
-	bch2_open_bucket_hash_add(c, ob);
-
-	track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
-	track_event_change(&c->times[BCH_TIME_blocked_allocate], false);
-
-	spin_unlock(&c->freelist_lock);
-	return ob;
-}
-
-static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
-					    enum bch_watermark watermark, u64 free_entry,
-					    struct bucket_alloc_state *s,
-					    struct bkey_s_c freespace_k,
-					    struct closure *cl)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter iter = { NULL };
-	struct bkey_s_c k;
-	struct open_bucket *ob;
-	struct bch_alloc_v4 a_convert;
-	const struct bch_alloc_v4 *a;
-	u64 b = free_entry & ~(~0ULL << 56);
-	unsigned genbits = free_entry >> 56;
-	struct printbuf buf = PRINTBUF;
-	int ret;
-
-	if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) {
-		prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n"
-		       "  freespace key ",
-			ca->mi.first_bucket, ca->mi.nbuckets);
-		bch2_bkey_val_to_text(&buf, c, freespace_k);
-		bch2_trans_inconsistent(trans, "%s", buf.buf);
-		ob = ERR_PTR(-EIO);
-		goto err;
-	}
-
-	k = bch2_bkey_get_iter(trans, &iter,
-			       BTREE_ID_alloc, POS(ca->dev_idx, b),
-			       BTREE_ITER_cached);
-	ret = bkey_err(k);
-	if (ret) {
-		ob = ERR_PTR(ret);
-		goto err;
-	}
-
-	a = bch2_alloc_to_v4(k, &a_convert);
-
-	if (a->data_type != BCH_DATA_free) {
-		if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
-			ob = NULL;
-			goto err;
-		}
-
-		prt_printf(&buf, "non free bucket in freespace btree\n"
-		       "  freespace key ");
-		bch2_bkey_val_to_text(&buf, c, freespace_k);
-		prt_printf(&buf, "\n  ");
-		bch2_bkey_val_to_text(&buf, c, k);
-		bch2_trans_inconsistent(trans, "%s", buf.buf);
-		ob = ERR_PTR(-EIO);
-		goto err;
-	}
-
-	if (genbits != (alloc_freespace_genbits(*a) >> 56) &&
-	    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
-		prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
-		       "  freespace key ",
-		       genbits, alloc_freespace_genbits(*a) >> 56);
-		bch2_bkey_val_to_text(&buf, c, freespace_k);
-		prt_printf(&buf, "\n  ");
-		bch2_bkey_val_to_text(&buf, c, k);
-		bch2_trans_inconsistent(trans, "%s", buf.buf);
-		ob = ERR_PTR(-EIO);
-		goto err;
-	}
-
-	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) {
-		struct bch_backpointer bp;
-		struct bpos bp_pos = POS_MIN;
-
-		ret = bch2_get_next_backpointer(trans, ca, POS(ca->dev_idx, b), -1,
-						&bp_pos, &bp,
-						BTREE_ITER_nopreserve);
-		if (ret) {
-			ob = ERR_PTR(ret);
-			goto err;
-		}
-
-		if (!bkey_eq(bp_pos, POS_MAX)) {
-			/*
-			 * Bucket may have data in it - we don't call
-			 * bc2h_trans_inconnsistent() because fsck hasn't
-			 * finished yet
-			 */
-			ob = NULL;
-			goto err;
-		}
-	}
-
-	ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl);
-	if (!ob)
-		bch2_set_btree_iter_dontneed(&iter);
-err:
-	if (iter.path)
-		bch2_set_btree_iter_dontneed(&iter);
-	bch2_trans_iter_exit(trans, &iter);
-	printbuf_exit(&buf);
-	return ob;
-}
-
-/*
- * This path is for before the freespace btree is initialized:
- *
- * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
- * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
- */
-static noinline struct open_bucket *
-bch2_bucket_alloc_early(struct btree_trans *trans,
-			struct bch_dev *ca,
-			enum bch_watermark watermark,
-			struct bucket_alloc_state *s,
-			struct closure *cl)
-{
-	struct btree_iter iter, citer;
-	struct bkey_s_c k, ck;
-	struct open_bucket *ob = NULL;
-	u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
-	u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap];
-	u64 alloc_start = max(first_bucket, *dev_alloc_cursor);
-	u64 alloc_cursor = alloc_start;
-	int ret;
-
-	/*
-	 * Scan with an uncached iterator to avoid polluting the key cache. An
-	 * uncached iter will return a cached key if one exists, but if not
-	 * there is no other underlying protection for the associated key cache
-	 * slot. To avoid racing bucket allocations, look up the cached key slot
-	 * of any likely allocation candidate before attempting to proceed with
-	 * the allocation. This provides proper exclusion on the associated
-	 * bucket.
-	 */
-again:
-	for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
-			   BTREE_ITER_slots, k, ret) {
-		u64 bucket = k.k->p.offset;
-
-		if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
-			break;
-
-		if (ca->new_fs_bucket_idx &&
-		    is_superblock_bucket(ca, k.k->p.offset))
-			continue;
-
-		if (s->btree_bitmap != BTREE_BITMAP_ANY &&
-		    s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
-				bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
-			if (s->btree_bitmap == BTREE_BITMAP_YES &&
-			    bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
-				break;
-
-			bucket = sector_to_bucket(ca,
-					round_up(bucket_to_sector(ca, bucket) + 1,
-						 1ULL << ca->mi.btree_bitmap_shift));
-			bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, bucket));
-			s->buckets_seen++;
-			s->skipped_mi_btree_bitmap++;
-			continue;
-		}
-
-		struct bch_alloc_v4 a_convert;
-		const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
-		if (a->data_type != BCH_DATA_free)
-			continue;
-
-		/* now check the cached key to serialize concurrent allocs of the bucket */
-		ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_cached);
-		ret = bkey_err(ck);
-		if (ret)
-			break;
-
-		a = bch2_alloc_to_v4(ck, &a_convert);
-		if (a->data_type != BCH_DATA_free)
-			goto next;
-
-		s->buckets_seen++;
-
-		ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
-next:
-		bch2_set_btree_iter_dontneed(&citer);
-		bch2_trans_iter_exit(trans, &citer);
-		if (ob)
-			break;
-	}
-	bch2_trans_iter_exit(trans, &iter);
-
-	alloc_cursor = iter.pos.offset;
-
-	if (!ob && ret)
-		ob = ERR_PTR(ret);
-
-	if (!ob && alloc_start > first_bucket) {
-		alloc_cursor = alloc_start = first_bucket;
-		goto again;
-	}
-
-	*dev_alloc_cursor = alloc_cursor;
-
-	return ob;
-}
-
-static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
-						   struct bch_dev *ca,
-						   enum bch_watermark watermark,
-						   struct bucket_alloc_state *s,
-						   struct closure *cl)
-{
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	struct open_bucket *ob = NULL;
-	u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap];
-	u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor));
-	u64 alloc_cursor = alloc_start;
-	int ret;
-
-	BUG_ON(ca->new_fs_bucket_idx);
-again:
-	for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
-				     POS(ca->dev_idx, alloc_cursor), 0, k, ret) {
-		if (k.k->p.inode != ca->dev_idx)
-			break;
-
-		for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k));
-		     alloc_cursor < k.k->p.offset;
-		     alloc_cursor++) {
-			ret = btree_trans_too_many_iters(trans);
-			if (ret) {
-				ob = ERR_PTR(ret);
-				break;
-			}
-
-			s->buckets_seen++;
-
-			u64 bucket = alloc_cursor & ~(~0ULL << 56);
-			if (s->btree_bitmap != BTREE_BITMAP_ANY &&
-			    s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
-					bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
-				if (s->btree_bitmap == BTREE_BITMAP_YES &&
-				    bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
-					goto fail;
-
-				bucket = sector_to_bucket(ca,
-						round_up(bucket_to_sector(ca, bucket) + 1,
-							 1ULL << ca->mi.btree_bitmap_shift));
-				u64 genbits = alloc_cursor >> 56;
-				alloc_cursor = bucket | (genbits << 56);
-
-				if (alloc_cursor > k.k->p.offset)
-					bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor));
-				s->skipped_mi_btree_bitmap++;
-				continue;
-			}
-
-			ob = try_alloc_bucket(trans, ca, watermark,
-					      alloc_cursor, s, k, cl);
-			if (ob) {
-				bch2_set_btree_iter_dontneed(&iter);
-				break;
-			}
-		}
-
-		if (ob || ret)
-			break;
-	}
-fail:
-	bch2_trans_iter_exit(trans, &iter);
-
-	if (!ob && ret)
-		ob = ERR_PTR(ret);
-
-	if (!ob && alloc_start > ca->mi.first_bucket) {
-		alloc_cursor = alloc_start = ca->mi.first_bucket;
-		goto again;
-	}
-
-	*dev_alloc_cursor = alloc_cursor;
-
-	return ob;
-}
-
-static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca,
-					 enum bch_watermark watermark,
-					 enum bch_data_type data_type,
-					 struct closure *cl,
-					 struct bch_dev_usage *usage,
-					 struct bucket_alloc_state *s,
-					 struct open_bucket *ob)
-{
-	struct printbuf buf = PRINTBUF;
-
-	printbuf_tabstop_push(&buf, 24);
-
-	prt_printf(&buf, "dev\t%s (%u)\n",	ca->name, ca->dev_idx);
-	prt_printf(&buf, "watermark\t%s\n",	bch2_watermarks[watermark]);
-	prt_printf(&buf, "data type\t%s\n",	__bch2_data_types[data_type]);
-	prt_printf(&buf, "blocking\t%u\n",	cl != NULL);
-	prt_printf(&buf, "free\t%llu\n",	usage->d[BCH_DATA_free].buckets);
-	prt_printf(&buf, "avail\t%llu\n",	dev_buckets_free(ca, *usage, watermark));
-	prt_printf(&buf, "copygc_wait\t%lu/%lli\n",
-		   bch2_copygc_wait_amount(c),
-		   c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now));
-	prt_printf(&buf, "seen\t%llu\n",	s->buckets_seen);
-	prt_printf(&buf, "open\t%llu\n",	s->skipped_open);
-	prt_printf(&buf, "need journal commit\t%llu\n", s->skipped_need_journal_commit);
-	prt_printf(&buf, "nocow\t%llu\n",	s->skipped_nocow);
-	prt_printf(&buf, "nouse\t%llu\n",	s->skipped_nouse);
-	prt_printf(&buf, "mi_btree_bitmap\t%llu\n", s->skipped_mi_btree_bitmap);
-
-	if (!IS_ERR(ob)) {
-		prt_printf(&buf, "allocated\t%llu\n", ob->bucket);
-		trace_bucket_alloc(c, buf.buf);
-	} else {
-		prt_printf(&buf, "err\t%s\n", bch2_err_str(PTR_ERR(ob)));
-		trace_bucket_alloc_fail(c, buf.buf);
-	}
-
-	printbuf_exit(&buf);
-}
-
-/**
- * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
- * @trans:	transaction object
- * @ca:		device to allocate from
- * @watermark:	how important is this allocation?
- * @data_type:	BCH_DATA_journal, btree, user...
- * @cl:		if not NULL, closure to be used to wait if buckets not available
- * @usage:	for secondarily also returning the current device usage
- *
- * Returns:	an open_bucket on success, or an ERR_PTR() on failure.
- */
-static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
-				      struct bch_dev *ca,
-				      enum bch_watermark watermark,
-				      enum bch_data_type data_type,
-				      struct closure *cl,
-				      struct bch_dev_usage *usage)
-{
-	struct bch_fs *c = trans->c;
-	struct open_bucket *ob = NULL;
-	bool freespace = READ_ONCE(ca->mi.freespace_initialized);
-	u64 avail;
-	struct bucket_alloc_state s = {
-		.btree_bitmap = data_type == BCH_DATA_btree,
-	};
-	bool waiting = false;
-again:
-	bch2_dev_usage_read_fast(ca, usage);
-	avail = dev_buckets_free(ca, *usage, watermark);
-
-	if (usage->d[BCH_DATA_need_discard].buckets > avail)
-		bch2_do_discards(c);
-
-	if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
-		bch2_gc_gens_async(c);
-
-	if (should_invalidate_buckets(ca, *usage))
-		bch2_do_invalidates(c);
-
-	if (!avail) {
-		if (cl && !waiting) {
-			closure_wait(&c->freelist_wait, cl);
-			waiting = true;
-			goto again;
-		}
-
-		track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
-
-		ob = ERR_PTR(-BCH_ERR_freelist_empty);
-		goto err;
-	}
-
-	if (waiting)
-		closure_wake_up(&c->freelist_wait);
-alloc:
-	ob = likely(freespace)
-		? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl)
-		: bch2_bucket_alloc_early(trans, ca, watermark, &s, cl);
-
-	if (s.skipped_need_journal_commit * 2 > avail)
-		bch2_journal_flush_async(&c->journal, NULL);
-
-	if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) {
-		s.btree_bitmap = BTREE_BITMAP_ANY;
-		goto alloc;
-	}
-
-	if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
-		freespace = false;
-		goto alloc;
-	}
-err:
-	if (!ob)
-		ob = ERR_PTR(-BCH_ERR_no_buckets_found);
-
-	if (!IS_ERR(ob))
-		ob->data_type = data_type;
-
-	if (!IS_ERR(ob))
-		count_event(c, bucket_alloc);
-	else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
-		count_event(c, bucket_alloc_fail);
-
-	if (!IS_ERR(ob)
-	    ? trace_bucket_alloc_enabled()
-	    : trace_bucket_alloc_fail_enabled())
-		trace_bucket_alloc2(c, ca, watermark, data_type, cl, usage, &s, ob);
-
-	return ob;
-}
-
-struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
-				      enum bch_watermark watermark,
-				      enum bch_data_type data_type,
-				      struct closure *cl)
-{
-	struct bch_dev_usage usage;
-	struct open_bucket *ob;
-
-	bch2_trans_do(c, NULL, NULL, 0,
-		      PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
-							data_type, cl, &usage)));
-	return ob;
-}
-
-static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
-			    unsigned l, unsigned r)
-{
-	return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
-		(stripe->next_alloc[l] < stripe->next_alloc[r]));
-}
-
-#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
-
-struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
-					  struct dev_stripe_state *stripe,
-					  struct bch_devs_mask *devs)
-{
-	struct dev_alloc_list ret = { .nr = 0 };
-	unsigned i;
-
-	for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
-		ret.devs[ret.nr++] = i;
-
-	bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
-	return ret;
-}
-
-static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
-			       struct dev_stripe_state *stripe,
-			       struct bch_dev_usage *usage)
-{
-	u64 *v = stripe->next_alloc + ca->dev_idx;
-	u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal);
-	u64 free_space_inv = free_space
-		? div64_u64(1ULL << 48, free_space)
-		: 1ULL << 48;
-	u64 scale = *v / 4;
-
-	if (*v + free_space_inv >= *v)
-		*v += free_space_inv;
-	else
-		*v = U64_MAX;
-
-	for (v = stripe->next_alloc;
-	     v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
-		*v = *v < scale ? 0 : *v - scale;
-}
-
-void bch2_dev_stripe_increment(struct bch_dev *ca,
-			       struct dev_stripe_state *stripe)
-{
-	struct bch_dev_usage usage;
-
-	bch2_dev_usage_read_fast(ca, &usage);
-	bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
-}
-
-static int add_new_bucket(struct bch_fs *c,
-			   struct open_buckets *ptrs,
-			   struct bch_devs_mask *devs_may_alloc,
-			   unsigned nr_replicas,
-			   unsigned *nr_effective,
-			   bool *have_cache,
-			   unsigned flags,
-			   struct open_bucket *ob)
-{
-	unsigned durability = ob_dev(c, ob)->mi.durability;
-
-	BUG_ON(*nr_effective >= nr_replicas);
-
-	__clear_bit(ob->dev, devs_may_alloc->d);
-	*nr_effective	+= durability;
-	*have_cache	|= !durability;
-
-	ob_push(c, ptrs, ob);
-
-	if (*nr_effective >= nr_replicas)
-		return 1;
-	if (ob->ec)
-		return 1;
-	return 0;
-}
-
-int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
-		      struct open_buckets *ptrs,
-		      struct dev_stripe_state *stripe,
-		      struct bch_devs_mask *devs_may_alloc,
-		      unsigned nr_replicas,
-		      unsigned *nr_effective,
-		      bool *have_cache,
-		      unsigned flags,
-		      enum bch_data_type data_type,
-		      enum bch_watermark watermark,
-		      struct closure *cl)
-{
-	struct bch_fs *c = trans->c;
-	struct dev_alloc_list devs_sorted =
-		bch2_dev_alloc_list(c, stripe, devs_may_alloc);
-	int ret = -BCH_ERR_insufficient_devices;
-
-	BUG_ON(*nr_effective >= nr_replicas);
-
-	for (unsigned i = 0; i < devs_sorted.nr; i++) {
-		struct bch_dev_usage usage;
-		struct open_bucket *ob;
-
-		unsigned dev = devs_sorted.devs[i];
-		struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev);
-		if (!ca)
-			continue;
-
-		if (!ca->mi.durability && *have_cache) {
-			bch2_dev_put(ca);
-			continue;
-		}
-
-		ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, cl, &usage);
-		if (!IS_ERR(ob))
-			bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
-		bch2_dev_put(ca);
-
-		if (IS_ERR(ob)) {
-			ret = PTR_ERR(ob);
-			if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
-				break;
-			continue;
-		}
-
-		if (add_new_bucket(c, ptrs, devs_may_alloc,
-				   nr_replicas, nr_effective,
-				   have_cache, flags, ob)) {
-			ret = 0;
-			break;
-		}
-	}
-
-	return ret;
-}
-
-/* Allocate from stripes: */
-
-/*
- * if we can't allocate a new stripe because there are already too many
- * partially filled stripes, force allocating from an existing stripe even when
- * it's to a device we don't want:
- */
-
-static int bucket_alloc_from_stripe(struct btree_trans *trans,
-			 struct open_buckets *ptrs,
-			 struct write_point *wp,
-			 struct bch_devs_mask *devs_may_alloc,
-			 u16 target,
-			 unsigned nr_replicas,
-			 unsigned *nr_effective,
-			 bool *have_cache,
-			 enum bch_watermark watermark,
-			 unsigned flags,
-			 struct closure *cl)
-{
-	struct bch_fs *c = trans->c;
-	struct dev_alloc_list devs_sorted;
-	struct ec_stripe_head *h;
-	struct open_bucket *ob;
-	unsigned i, ec_idx;
-	int ret = 0;
-
-	if (nr_replicas < 2)
-		return 0;
-
-	if (ec_open_bucket(c, ptrs))
-		return 0;
-
-	h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
-	if (IS_ERR(h))
-		return PTR_ERR(h);
-	if (!h)
-		return 0;
-
-	devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
-
-	for (i = 0; i < devs_sorted.nr; i++)
-		for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
-			if (!h->s->blocks[ec_idx])
-				continue;
-
-			ob = c->open_buckets + h->s->blocks[ec_idx];
-			if (ob->dev == devs_sorted.devs[i] &&
-			    !test_and_set_bit(ec_idx, h->s->blocks_allocated))
-				goto got_bucket;
-		}
-	goto out_put_head;
-got_bucket:
-	ob->ec_idx	= ec_idx;
-	ob->ec		= h->s;
-	ec_stripe_new_get(h->s, STRIPE_REF_io);
-
-	ret = add_new_bucket(c, ptrs, devs_may_alloc,
-			     nr_replicas, nr_effective,
-			     have_cache, flags, ob);
-out_put_head:
-	bch2_ec_stripe_head_put(c, h);
-	return ret;
-}
-
-/* Sector allocator */
-
-static bool want_bucket(struct bch_fs *c,
-			struct write_point *wp,
-			struct bch_devs_mask *devs_may_alloc,
-			bool *have_cache, bool ec,
-			struct open_bucket *ob)
-{
-	struct bch_dev *ca = ob_dev(c, ob);
-
-	if (!test_bit(ob->dev, devs_may_alloc->d))
-		return false;
-
-	if (ob->data_type != wp->data_type)
-		return false;
-
-	if (!ca->mi.durability &&
-	    (wp->data_type == BCH_DATA_btree || ec || *have_cache))
-		return false;
-
-	if (ec != (ob->ec != NULL))
-		return false;
-
-	return true;
-}
-
-static int bucket_alloc_set_writepoint(struct bch_fs *c,
-				       struct open_buckets *ptrs,
-				       struct write_point *wp,
-				       struct bch_devs_mask *devs_may_alloc,
-				       unsigned nr_replicas,
-				       unsigned *nr_effective,
-				       bool *have_cache,
-				       bool ec, unsigned flags)
-{
-	struct open_buckets ptrs_skip = { .nr = 0 };
-	struct open_bucket *ob;
-	unsigned i;
-	int ret = 0;
-
-	open_bucket_for_each(c, &wp->ptrs, ob, i) {
-		if (!ret && want_bucket(c, wp, devs_may_alloc,
-					have_cache, ec, ob))
-			ret = add_new_bucket(c, ptrs, devs_may_alloc,
-				       nr_replicas, nr_effective,
-				       have_cache, flags, ob);
-		else
-			ob_push(c, &ptrs_skip, ob);
-	}
-	wp->ptrs = ptrs_skip;
-
-	return ret;
-}
-
-static int bucket_alloc_set_partial(struct bch_fs *c,
-				    struct open_buckets *ptrs,
-				    struct write_point *wp,
-				    struct bch_devs_mask *devs_may_alloc,
-				    unsigned nr_replicas,
-				    unsigned *nr_effective,
-				    bool *have_cache, bool ec,
-				    enum bch_watermark watermark,
-				    unsigned flags)
-{
-	int i, ret = 0;
-
-	if (!c->open_buckets_partial_nr)
-		return 0;
-
-	spin_lock(&c->freelist_lock);
-
-	if (!c->open_buckets_partial_nr)
-		goto unlock;
-
-	for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
-		struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
-
-		if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
-			struct bch_dev *ca = ob_dev(c, ob);
-			struct bch_dev_usage usage;
-			u64 avail;
-
-			bch2_dev_usage_read_fast(ca, &usage);
-			avail = dev_buckets_free(ca, usage, watermark);
-			if (!avail)
-				continue;
-
-			array_remove_item(c->open_buckets_partial,
-					  c->open_buckets_partial_nr,
-					  i);
-			ob->on_partial_list = false;
-
-			ret = add_new_bucket(c, ptrs, devs_may_alloc,
-					     nr_replicas, nr_effective,
-					     have_cache, flags, ob);
-			if (ret)
-				break;
-		}
-	}
-unlock:
-	spin_unlock(&c->freelist_lock);
-	return ret;
-}
-
-static int __open_bucket_add_buckets(struct btree_trans *trans,
-			struct open_buckets *ptrs,
-			struct write_point *wp,
-			struct bch_devs_list *devs_have,
-			u16 target,
-			bool erasure_code,
-			unsigned nr_replicas,
-			unsigned *nr_effective,
-			bool *have_cache,
-			enum bch_watermark watermark,
-			unsigned flags,
-			struct closure *_cl)
-{
-	struct bch_fs *c = trans->c;
-	struct bch_devs_mask devs;
-	struct open_bucket *ob;
-	struct closure *cl = NULL;
-	unsigned i;
-	int ret;
-
-	devs = target_rw_devs(c, wp->data_type, target);
-
-	/* Don't allocate from devices we already have pointers to: */
-	darray_for_each(*devs_have, i)
-		__clear_bit(*i, devs.d);
-
-	open_bucket_for_each(c, ptrs, ob, i)
-		__clear_bit(ob->dev, devs.d);
-
-	if (erasure_code && ec_open_bucket(c, ptrs))
-		return 0;
-
-	ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
-				 nr_replicas, nr_effective,
-				 have_cache, erasure_code, flags);
-	if (ret)
-		return ret;
-
-	ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
-				 nr_replicas, nr_effective,
-				 have_cache, erasure_code, watermark, flags);
-	if (ret)
-		return ret;
-
-	if (erasure_code) {
-		ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs,
-					 target,
-					 nr_replicas, nr_effective,
-					 have_cache,
-					 watermark, flags, _cl);
-	} else {
-retry_blocking:
-		/*
-		 * Try nonblocking first, so that if one device is full we'll try from
-		 * other devices:
-		 */
-		ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
-					nr_replicas, nr_effective, have_cache,
-					flags, wp->data_type, watermark, cl);
-		if (ret &&
-		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
-		    !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
-		    !cl && _cl) {
-			cl = _cl;
-			goto retry_blocking;
-		}
-	}
-
-	return ret;
-}
-
-static int open_bucket_add_buckets(struct btree_trans *trans,
-			struct open_buckets *ptrs,
-			struct write_point *wp,
-			struct bch_devs_list *devs_have,
-			u16 target,
-			unsigned erasure_code,
-			unsigned nr_replicas,
-			unsigned *nr_effective,
-			bool *have_cache,
-			enum bch_watermark watermark,
-			unsigned flags,
-			struct closure *cl)
-{
-	int ret;
-
-	if (erasure_code) {
-		ret = __open_bucket_add_buckets(trans, ptrs, wp,
-				devs_have, target, erasure_code,
-				nr_replicas, nr_effective, have_cache,
-				watermark, flags, cl);
-		if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
-		    bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
-		    bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
-		    bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
-			return ret;
-		if (*nr_effective >= nr_replicas)
-			return 0;
-	}
-
-	ret = __open_bucket_add_buckets(trans, ptrs, wp,
-			devs_have, target, false,
-			nr_replicas, nr_effective, have_cache,
-			watermark, flags, cl);
-	return ret < 0 ? ret : 0;
-}
-
-/**
- * should_drop_bucket - check if this is open_bucket should go away
- * @ob:		open_bucket to predicate on
- * @c:		filesystem handle
- * @ca:		if set, we're killing buckets for a particular device
- * @ec:		if true, we're shutting down erasure coding and killing all ec
- *		open_buckets
- *		otherwise, return true
- * Returns: true if we should kill this open_bucket
- *
- * We're killing open_buckets because we're shutting down a device, erasure
- * coding, or the entire filesystem - check if this open_bucket matches:
- */
-static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
-			       struct bch_dev *ca, bool ec)
-{
-	if (ec) {
-		return ob->ec != NULL;
-	} else if (ca) {
-		bool drop = ob->dev == ca->dev_idx;
-		struct open_bucket *ob2;
-		unsigned i;
-
-		if (!drop && ob->ec) {
-			unsigned nr_blocks;
-
-			mutex_lock(&ob->ec->lock);
-			nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks;
-
-			for (i = 0; i < nr_blocks; i++) {
-				if (!ob->ec->blocks[i])
-					continue;
-
-				ob2 = c->open_buckets + ob->ec->blocks[i];
-				drop |= ob2->dev == ca->dev_idx;
-			}
-			mutex_unlock(&ob->ec->lock);
-		}
-
-		return drop;
-	} else {
-		return true;
-	}
-}
-
-static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
-				 bool ec, struct write_point *wp)
-{
-	struct open_buckets ptrs = { .nr = 0 };
-	struct open_bucket *ob;
-	unsigned i;
-
-	mutex_lock(&wp->lock);
-	open_bucket_for_each(c, &wp->ptrs, ob, i)
-		if (should_drop_bucket(ob, c, ca, ec))
-			bch2_open_bucket_put(c, ob);
-		else
-			ob_push(c, &ptrs, ob);
-	wp->ptrs = ptrs;
-	mutex_unlock(&wp->lock);
-}
-
-void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
-			    bool ec)
-{
-	unsigned i;
-
-	/* Next, close write points that point to this device... */
-	for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
-		bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);
-
-	bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
-	bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
-	bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);
-
-	mutex_lock(&c->btree_reserve_cache_lock);
-	while (c->btree_reserve_cache_nr) {
-		struct btree_alloc *a =
-			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];
-
-		bch2_open_buckets_put(c, &a->ob);
-	}
-	mutex_unlock(&c->btree_reserve_cache_lock);
-
-	spin_lock(&c->freelist_lock);
-	i = 0;
-	while (i < c->open_buckets_partial_nr) {
-		struct open_bucket *ob =
-			c->open_buckets + c->open_buckets_partial[i];
-
-		if (should_drop_bucket(ob, c, ca, ec)) {
-			--c->open_buckets_partial_nr;
-			swap(c->open_buckets_partial[i],
-			     c->open_buckets_partial[c->open_buckets_partial_nr]);
-			ob->on_partial_list = false;
-			spin_unlock(&c->freelist_lock);
-			bch2_open_bucket_put(c, ob);
-			spin_lock(&c->freelist_lock);
-		} else {
-			i++;
-		}
-	}
-	spin_unlock(&c->freelist_lock);
-
-	bch2_ec_stop_dev(c, ca);
-}
-
-static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
-						 unsigned long write_point)
-{
-	unsigned hash =
-		hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
-
-	return &c->write_points_hash[hash];
-}
-
-static struct write_point *__writepoint_find(struct hlist_head *head,
-					     unsigned long write_point)
-{
-	struct write_point *wp;
-
-	rcu_read_lock();
-	hlist_for_each_entry_rcu(wp, head, node)
-		if (wp->write_point == write_point)
-			goto out;
-	wp = NULL;
-out:
-	rcu_read_unlock();
-	return wp;
-}
-
-static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
-{
-	u64 stranded	= c->write_points_nr * c->bucket_size_max;
-	u64 free	= bch2_fs_usage_read_short(c).free;
-
-	return stranded * factor > free;
-}
-
-static bool try_increase_writepoints(struct bch_fs *c)
-{
-	struct write_point *wp;
-
-	if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
-	    too_many_writepoints(c, 32))
-		return false;
-
-	wp = c->write_points + c->write_points_nr++;
-	hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
-	return true;
-}
-
-static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
-{
-	struct bch_fs *c = trans->c;
-	struct write_point *wp;
-	struct open_bucket *ob;
-	unsigned i;
-
-	mutex_lock(&c->write_points_hash_lock);
-	if (c->write_points_nr < old_nr) {
-		mutex_unlock(&c->write_points_hash_lock);
-		return true;
-	}
-
-	if (c->write_points_nr == 1 ||
-	    !too_many_writepoints(c, 8)) {
-		mutex_unlock(&c->write_points_hash_lock);
-		return false;
-	}
-
-	wp = c->write_points + --c->write_points_nr;
-
-	hlist_del_rcu(&wp->node);
-	mutex_unlock(&c->write_points_hash_lock);
-
-	bch2_trans_mutex_lock_norelock(trans, &wp->lock);
-	open_bucket_for_each(c, &wp->ptrs, ob, i)
-		open_bucket_free_unused(c, ob);
-	wp->ptrs.nr = 0;
-	mutex_unlock(&wp->lock);
-	return true;
-}
-
-static struct write_point *writepoint_find(struct btree_trans *trans,
-					   unsigned long write_point)
-{
-	struct bch_fs *c = trans->c;
-	struct write_point *wp, *oldest;
-	struct hlist_head *head;
-
-	if (!(write_point & 1UL)) {
-		wp = (struct write_point *) write_point;
-		bch2_trans_mutex_lock_norelock(trans, &wp->lock);
-		return wp;
-	}
-
-	head = writepoint_hash(c, write_point);
-restart_find:
-	wp = __writepoint_find(head, write_point);
-	if (wp) {
-lock_wp:
-		bch2_trans_mutex_lock_norelock(trans, &wp->lock);
-		if (wp->write_point == write_point)
-			goto out;
-		mutex_unlock(&wp->lock);
-		goto restart_find;
-	}
-restart_find_oldest:
-	oldest = NULL;
-	for (wp = c->write_points;
-	     wp < c->write_points + c->write_points_nr; wp++)
-		if (!oldest || time_before64(wp->last_used, oldest->last_used))
-			oldest = wp;
-
-	bch2_trans_mutex_lock_norelock(trans, &oldest->lock);
-	bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock);
-	if (oldest >= c->write_points + c->write_points_nr ||
-	    try_increase_writepoints(c)) {
-		mutex_unlock(&c->write_points_hash_lock);
-		mutex_unlock(&oldest->lock);
-		goto restart_find_oldest;
-	}
-
-	wp = __writepoint_find(head, write_point);
-	if (wp && wp != oldest) {
-		mutex_unlock(&c->write_points_hash_lock);
-		mutex_unlock(&oldest->lock);
-		goto lock_wp;
-	}
-
-	wp = oldest;
-	hlist_del_rcu(&wp->node);
-	wp->write_point = write_point;
-	hlist_add_head_rcu(&wp->node, head);
-	mutex_unlock(&c->write_points_hash_lock);
-out:
-	wp->last_used = local_clock();
-	return wp;
-}
-
-static noinline void
-deallocate_extra_replicas(struct bch_fs *c,
-			  struct open_buckets *ptrs,
-			  struct open_buckets *ptrs_no_use,
-			  unsigned extra_replicas)
-{
-	struct open_buckets ptrs2 = { 0 };
-	struct open_bucket *ob;
-	unsigned i;
-
-	open_bucket_for_each(c, ptrs, ob, i) {
-		unsigned d = ob_dev(c, ob)->mi.durability;
-
-		if (d && d <= extra_replicas) {
-			extra_replicas -= d;
-			ob_push(c, ptrs_no_use, ob);
-		} else {
-			ob_push(c, &ptrs2, ob);
-		}
-	}
-
-	*ptrs = ptrs2;
-}
-
-/*
- * Get us an open_bucket we can allocate from, return with it locked:
- */
-int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
-			     unsigned target,
-			     unsigned erasure_code,
-			     struct write_point_specifier write_point,
-			     struct bch_devs_list *devs_have,
-			     unsigned nr_replicas,
-			     unsigned nr_replicas_required,
-			     enum bch_watermark watermark,
-			     unsigned flags,
-			     struct closure *cl,
-			     struct write_point **wp_ret)
-{
-	struct bch_fs *c = trans->c;
-	struct write_point *wp;
-	struct open_bucket *ob;
-	struct open_buckets ptrs;
-	unsigned nr_effective, write_points_nr;
-	bool have_cache;
-	int ret;
-	int i;
-
-	if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
-		erasure_code = false;
-
-	BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
-
-	BUG_ON(!nr_replicas || !nr_replicas_required);
-retry:
-	ptrs.nr		= 0;
-	nr_effective	= 0;
-	write_points_nr = c->write_points_nr;
-	have_cache	= false;
-
-	*wp_ret = wp = writepoint_find(trans, write_point.v);
-
-	ret = bch2_trans_relock(trans);
-	if (ret)
-		goto err;
-
-	/* metadata may not allocate on cache devices: */
-	if (wp->data_type != BCH_DATA_user)
-		have_cache = true;
-
-	if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
-		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
-					      target, erasure_code,
-					      nr_replicas, &nr_effective,
-					      &have_cache, watermark,
-					      flags, NULL);
-		if (!ret ||
-		    bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			goto alloc_done;
-
-		/* Don't retry from all devices if we're out of open buckets: */
-		if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
-			int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
-					      target, erasure_code,
-					      nr_replicas, &nr_effective,
-					      &have_cache, watermark,
-					      flags, cl);
-			if (!ret2 ||
-			    bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
-			    bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
-				ret = ret2;
-				goto alloc_done;
-			}
-		}
-
-		/*
-		 * Only try to allocate cache (durability = 0 devices) from the
-		 * specified target:
-		 */
-		have_cache = true;
-
-		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
-					      0, erasure_code,
-					      nr_replicas, &nr_effective,
-					      &have_cache, watermark,
-					      flags, cl);
-	} else {
-		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
-					      target, erasure_code,
-					      nr_replicas, &nr_effective,
-					      &have_cache, watermark,
-					      flags, cl);
-	}
-alloc_done:
-	BUG_ON(!ret && nr_effective < nr_replicas);
-
-	if (erasure_code && !ec_open_bucket(c, &ptrs))
-		pr_debug("failed to get ec bucket: ret %u", ret);
-
-	if (ret == -BCH_ERR_insufficient_devices &&
-	    nr_effective >= nr_replicas_required)
-		ret = 0;
-
-	if (ret)
-		goto err;
-
-	if (nr_effective > nr_replicas)
-		deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);
-
-	/* Free buckets we didn't use: */
-	open_bucket_for_each(c, &wp->ptrs, ob, i)
-		open_bucket_free_unused(c, ob);
-
-	wp->ptrs = ptrs;
-
-	wp->sectors_free = UINT_MAX;
-
-	open_bucket_for_each(c, &wp->ptrs, ob, i)
-		wp->sectors_free = min(wp->sectors_free, ob->sectors_free);
-
-	BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
-
-	return 0;
-err:
-	open_bucket_for_each(c, &wp->ptrs, ob, i)
-		if (ptrs.nr < ARRAY_SIZE(ptrs.v))
-			ob_push(c, &ptrs, ob);
-		else
-			open_bucket_free_unused(c, ob);
-	wp->ptrs = ptrs;
-
-	mutex_unlock(&wp->lock);
-
-	if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
-	    try_decrease_writepoints(trans, write_points_nr))
-		goto retry;
-
-	if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) ||
-	    bch2_err_matches(ret, BCH_ERR_freelist_empty))
-		return cl
-			? -BCH_ERR_bucket_alloc_blocked
-			: -BCH_ERR_ENOSPC_bucket_alloc;
-
-	return ret;
-}
-
-struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
-{
-	struct bch_dev *ca = ob_dev(c, ob);
-
-	return (struct bch_extent_ptr) {
-		.type	= 1 << BCH_EXTENT_ENTRY_ptr,
-		.gen	= ob->gen,
-		.dev	= ob->dev,
-		.offset	= bucket_to_sector(ca, ob->bucket) +
-			ca->mi.bucket_size -
-			ob->sectors_free,
-	};
-}
-
-void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
-				    struct bkey_i *k, unsigned sectors,
-				    bool cached)
-{
-	bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
-}
-
-/*
- * Append pointers to the space we just allocated to @k, and mark @sectors space
- * as allocated out of @ob
- */
-void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
-{
-	bch2_alloc_sectors_done_inlined(c, wp);
-}
-
-static inline void writepoint_init(struct write_point *wp,
-				   enum bch_data_type type)
-{
-	mutex_init(&wp->lock);
-	wp->data_type = type;
-
-	INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
-	INIT_LIST_HEAD(&wp->writes);
-	spin_lock_init(&wp->writes_lock);
-}
-
-void bch2_fs_allocator_foreground_init(struct bch_fs *c)
-{
-	struct open_bucket *ob;
-	struct write_point *wp;
-
-	mutex_init(&c->write_points_hash_lock);
-	c->write_points_nr = ARRAY_SIZE(c->write_points);
-
-	/* open bucket 0 is a sentinal NULL: */
-	spin_lock_init(&c->open_buckets[0].lock);
-
-	for (ob = c->open_buckets + 1;
-	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
-		spin_lock_init(&ob->lock);
-		c->open_buckets_nr_free++;
-
-		ob->freelist = c->open_buckets_freelist;
-		c->open_buckets_freelist = ob - c->open_buckets;
-	}
-
-	writepoint_init(&c->btree_write_point,		BCH_DATA_btree);
-	writepoint_init(&c->rebalance_write_point,	BCH_DATA_user);
-	writepoint_init(&c->copygc_write_point,		BCH_DATA_user);
-
-	for (wp = c->write_points;
-	     wp < c->write_points + c->write_points_nr; wp++) {
-		writepoint_init(wp, BCH_DATA_user);
-
-		wp->last_used	= local_clock();
-		wp->write_point	= (unsigned long) wp;
-		hlist_add_head_rcu(&wp->node,
-				   writepoint_hash(c, wp->write_point));
-	}
-}
-
-static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
-{
-	struct bch_dev *ca = ob_dev(c, ob);
-	unsigned data_type = ob->data_type;
-	barrier(); /* READ_ONCE() doesn't work on bitfields */
-
-	prt_printf(out, "%zu ref %u ",
-		   ob - c->open_buckets,
-		   atomic_read(&ob->pin));
-	bch2_prt_data_type(out, data_type);
-	prt_printf(out, " %u:%llu gen %u allocated %u/%u",
-		   ob->dev, ob->bucket, ob->gen,
-		   ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
-	if (ob->ec)
-		prt_printf(out, " ec idx %llu", ob->ec->idx);
-	if (ob->on_partial_list)
-		prt_str(out, " partial");
-	prt_newline(out);
-}
-
-void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
-{
-	struct open_bucket *ob;
-
-	out->atomic++;
-
-	for (ob = c->open_buckets;
-	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
-	     ob++) {
-		spin_lock(&ob->lock);
-		if (ob->valid && !ob->on_partial_list)
-			bch2_open_bucket_to_text(out, c, ob);
-		spin_unlock(&ob->lock);
-	}
-
-	--out->atomic;
-}
-
-void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c)
-{
-	unsigned i;
-
-	out->atomic++;
-	spin_lock(&c->freelist_lock);
-
-	for (i = 0; i < c->open_buckets_partial_nr; i++)
-		bch2_open_bucket_to_text(out, c,
-				c->open_buckets + c->open_buckets_partial[i]);
-
-	spin_unlock(&c->freelist_lock);
-	--out->atomic;
-}
-
-static const char * const bch2_write_point_states[] = {
-#define x(n)	#n,
-	WRITE_POINT_STATES()
-#undef x
-	NULL
-};
-
-static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
-				     struct write_point *wp)
-{
-	struct open_bucket *ob;
-	unsigned i;
-
-	prt_printf(out, "%lu: ", wp->write_point);
-	prt_human_readable_u64(out, wp->sectors_allocated);
-
-	prt_printf(out, " last wrote: ");
-	bch2_pr_time_units(out, sched_clock() - wp->last_used);
-
-	for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
-		prt_printf(out, " %s: ", bch2_write_point_states[i]);
-		bch2_pr_time_units(out, wp->time[i]);
-	}
-
-	prt_newline(out);
-
-	printbuf_indent_add(out, 2);
-	open_bucket_for_each(c, &wp->ptrs, ob, i)
-		bch2_open_bucket_to_text(out, c, ob);
-	printbuf_indent_sub(out, 2);
-}
-
-void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
-{
-	struct write_point *wp;
-
-	prt_str(out, "Foreground write points\n");
-	for (wp = c->write_points;
-	     wp < c->write_points + ARRAY_SIZE(c->write_points);
-	     wp++)
-		bch2_write_point_to_text(out, c, wp);
-
-	prt_str(out, "Copygc write point\n");
-	bch2_write_point_to_text(out, c, &c->copygc_write_point);
-
-	prt_str(out, "Rebalance write point\n");
-	bch2_write_point_to_text(out, c, &c->rebalance_write_point);
-
-	prt_str(out, "Btree write point\n");
-	bch2_write_point_to_text(out, c, &c->btree_write_point);
-}
-
-void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
-{
-	unsigned nr[BCH_DATA_NR];
-
-	memset(nr, 0, sizeof(nr));
-
-	for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
-		nr[c->open_buckets[i].data_type]++;
-
-	printbuf_tabstop_push(out, 24);
-
-	percpu_down_read(&c->mark_lock);
-	prt_printf(out, "hidden\t%llu\n",			bch2_fs_usage_read_one(c, &c->usage_base->b.hidden));
-	prt_printf(out, "btree\t%llu\n",			bch2_fs_usage_read_one(c, &c->usage_base->b.btree));
-	prt_printf(out, "data\t%llu\n",				bch2_fs_usage_read_one(c, &c->usage_base->b.data));
-	prt_printf(out, "cached\t%llu\n",			bch2_fs_usage_read_one(c, &c->usage_base->b.cached));
-	prt_printf(out, "reserved\t%llu\n",			bch2_fs_usage_read_one(c, &c->usage_base->b.reserved));
-	prt_printf(out, "online_reserved\t%llu\n",		percpu_u64_get(c->online_reserved));
-	prt_printf(out, "nr_inodes\t%llu\n",			bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes));
-	percpu_up_read(&c->mark_lock);
-
-	prt_newline(out);
-	prt_printf(out, "freelist_wait\t%s\n",			c->freelist_wait.list.first ? "waiting" : "empty");
-	prt_printf(out, "open buckets allocated\t%i\n",		OPEN_BUCKETS_COUNT - c->open_buckets_nr_free);
-	prt_printf(out, "open buckets total\t%u\n",		OPEN_BUCKETS_COUNT);
-	prt_printf(out, "open_buckets_wait\t%s\n",		c->open_buckets_wait.list.first ? "waiting" : "empty");
-	prt_printf(out, "open_buckets_btree\t%u\n",		nr[BCH_DATA_btree]);
-	prt_printf(out, "open_buckets_user\t%u\n",		nr[BCH_DATA_user]);
-	prt_printf(out, "btree reserve cache\t%u\n",		c->btree_reserve_cache_nr);
-}
-
-void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
-{
-	struct bch_fs *c = ca->fs;
-	struct bch_dev_usage stats = bch2_dev_usage_read(ca);
-	unsigned nr[BCH_DATA_NR];
-
-	memset(nr, 0, sizeof(nr));
-
-	for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
-		nr[c->open_buckets[i].data_type]++;
-
-	printbuf_tabstop_push(out, 12);
-	printbuf_tabstop_push(out, 16);
-	printbuf_tabstop_push(out, 16);
-	printbuf_tabstop_push(out, 16);
-	printbuf_tabstop_push(out, 16);
-
-	bch2_dev_usage_to_text(out, &stats);
-
-	prt_newline(out);
-
-	prt_printf(out, "reserves:\n");
-	for (unsigned i = 0; i < BCH_WATERMARK_NR; i++)
-		prt_printf(out, "%s\t%llu\r\n", bch2_watermarks[i], bch2_dev_buckets_reserved(ca, i));
-
-	prt_newline(out);
-
-	printbuf_tabstops_reset(out);
-	printbuf_tabstop_push(out, 12);
-	printbuf_tabstop_push(out, 16);
-
-	prt_printf(out, "open buckets\t%i\r\n",	ca->nr_open_buckets);
-	prt_printf(out, "buckets to invalidate\t%llu\r\n",	should_invalidate_buckets(ca, stats));
-}
-
-void bch2_print_allocator_stuck(struct bch_fs *c)
-{
-	struct printbuf buf = PRINTBUF;
-
-	prt_printf(&buf, "Allocator stuck? Waited for 10 seconds\n");
-
-	prt_printf(&buf, "Allocator debug:\n");
-	printbuf_indent_add(&buf, 2);
-	bch2_fs_alloc_debug_to_text(&buf, c);
-	printbuf_indent_sub(&buf, 2);
-	prt_newline(&buf);
-
-	for_each_online_member(c, ca) {
-		prt_printf(&buf, "Dev %u:\n", ca->dev_idx);
-		printbuf_indent_add(&buf, 2);
-		bch2_dev_alloc_debug_to_text(&buf, ca);
-		printbuf_indent_sub(&buf, 2);
-		prt_newline(&buf);
-	}
-
-	prt_printf(&buf, "Copygc debug:\n");
-	printbuf_indent_add(&buf, 2);
-	bch2_copygc_wait_to_text(&buf, c);
-	printbuf_indent_sub(&buf, 2);
-	prt_newline(&buf);
-
-	prt_printf(&buf, "Journal debug:\n");
-	printbuf_indent_add(&buf, 2);
-	bch2_journal_debug_to_text(&buf, &c->journal);
-	printbuf_indent_sub(&buf, 2);
-
-	bch2_print_string_as_lines(KERN_ERR, buf.buf);
-	printbuf_exit(&buf);
-}
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
deleted file mode 100644
index a42c9730d32a..000000000000
--- a/fs/bcachefs/alloc_foreground.h
+++ /dev/null
@@ -1,235 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ALLOC_FOREGROUND_H
-#define _BCACHEFS_ALLOC_FOREGROUND_H
-
-#include "bcachefs.h"
-#include "alloc_types.h"
-#include "extents.h"
-#include "sb-members.h"
-
-#include <linux/hash.h>
-
-struct bkey;
-struct bch_dev;
-struct bch_fs;
-struct bch_devs_List;
-
-extern const char * const bch2_watermarks[];
-
-void bch2_reset_alloc_cursors(struct bch_fs *);
-
-struct dev_alloc_list {
-	unsigned	nr;
-	u8		devs[BCH_SB_MEMBERS_MAX];
-};
-
-struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *,
-					  struct dev_stripe_state *,
-					  struct bch_devs_mask *);
-void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *);
-
-long bch2_bucket_alloc_new_fs(struct bch_dev *);
-
-static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob)
-{
-	return bch2_dev_have_ref(c, ob->dev);
-}
-
-struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *,
-				      enum bch_watermark, enum bch_data_type,
-				      struct closure *);
-
-static inline void ob_push(struct bch_fs *c, struct open_buckets *obs,
-			   struct open_bucket *ob)
-{
-	BUG_ON(obs->nr >= ARRAY_SIZE(obs->v));
-
-	obs->v[obs->nr++] = ob - c->open_buckets;
-}
-
-#define open_bucket_for_each(_c, _obs, _ob, _i)				\
-	for ((_i) = 0;							\
-	     (_i) < (_obs)->nr &&					\
-	     ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true);	\
-	     (_i)++)
-
-static inline struct open_bucket *ec_open_bucket(struct bch_fs *c,
-						 struct open_buckets *obs)
-{
-	struct open_bucket *ob;
-	unsigned i;
-
-	open_bucket_for_each(c, obs, ob, i)
-		if (ob->ec)
-			return ob;
-
-	return NULL;
-}
-
-void bch2_open_bucket_write_error(struct bch_fs *,
-			struct open_buckets *, unsigned);
-
-void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
-
-static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
-{
-	if (atomic_dec_and_test(&ob->pin))
-		__bch2_open_bucket_put(c, ob);
-}
-
-static inline void bch2_open_buckets_put(struct bch_fs *c,
-					 struct open_buckets *ptrs)
-{
-	struct open_bucket *ob;
-	unsigned i;
-
-	open_bucket_for_each(c, ptrs, ob, i)
-		bch2_open_bucket_put(c, ob);
-	ptrs->nr = 0;
-}
-
-static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct write_point *wp)
-{
-	struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 };
-	struct open_bucket *ob;
-	unsigned i;
-
-	open_bucket_for_each(c, &wp->ptrs, ob, i)
-		ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob);
-	wp->ptrs = keep;
-
-	mutex_unlock(&wp->lock);
-
-	bch2_open_buckets_put(c, &ptrs);
-}
-
-static inline void bch2_open_bucket_get(struct bch_fs *c,
-					struct write_point *wp,
-					struct open_buckets *ptrs)
-{
-	struct open_bucket *ob;
-	unsigned i;
-
-	open_bucket_for_each(c, &wp->ptrs, ob, i) {
-		ob->data_type = wp->data_type;
-		atomic_inc(&ob->pin);
-		ob_push(c, ptrs, ob);
-	}
-}
-
-static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c,
-						  unsigned dev, u64 bucket)
-{
-	return c->open_buckets_hash +
-		(jhash_3words(dev, bucket, bucket >> 32, 0) &
-		 (OPEN_BUCKETS_COUNT - 1));
-}
-
-static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket)
-{
-	open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket);
-
-	while (slot) {
-		struct open_bucket *ob = &c->open_buckets[slot];
-
-		if (ob->dev == dev && ob->bucket == bucket)
-			return true;
-
-		slot = ob->hash;
-	}
-
-	return false;
-}
-
-static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket)
-{
-	bool ret;
-
-	if (bch2_bucket_is_open(c, dev, bucket))
-		return true;
-
-	spin_lock(&c->freelist_lock);
-	ret = bch2_bucket_is_open(c, dev, bucket);
-	spin_unlock(&c->freelist_lock);
-
-	return ret;
-}
-
-int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *,
-		      struct dev_stripe_state *, struct bch_devs_mask *,
-		      unsigned, unsigned *, bool *, unsigned,
-		      enum bch_data_type, enum bch_watermark,
-		      struct closure *);
-
-int bch2_alloc_sectors_start_trans(struct btree_trans *,
-				   unsigned, unsigned,
-				   struct write_point_specifier,
-				   struct bch_devs_list *,
-				   unsigned, unsigned,
-				   enum bch_watermark,
-				   unsigned,
-				   struct closure *,
-				   struct write_point **);
-
-struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *);
-
-/*
- * Append pointers to the space we just allocated to @k, and mark @sectors space
- * as allocated out of @ob
- */
-static inline void
-bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp,
-				       struct bkey_i *k, unsigned sectors,
-				       bool cached)
-{
-	struct open_bucket *ob;
-	unsigned i;
-
-	BUG_ON(sectors > wp->sectors_free);
-	wp->sectors_free	-= sectors;
-	wp->sectors_allocated	+= sectors;
-
-	open_bucket_for_each(c, &wp->ptrs, ob, i) {
-		struct bch_dev *ca = ob_dev(c, ob);
-		struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob);
-
-		ptr.cached = cached ||
-			(!ca->mi.durability &&
-			 wp->data_type == BCH_DATA_user);
-
-		bch2_bkey_append_ptr(k, ptr);
-
-		BUG_ON(sectors > ob->sectors_free);
-		ob->sectors_free -= sectors;
-	}
-}
-
-void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
-				    struct bkey_i *, unsigned, bool);
-void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
-
-void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *, bool);
-
-static inline struct write_point_specifier writepoint_hashed(unsigned long v)
-{
-	return (struct write_point_specifier) { .v = v | 1 };
-}
-
-static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
-{
-	return (struct write_point_specifier) { .v = (unsigned long) wp };
-}
-
-void bch2_fs_allocator_foreground_init(struct bch_fs *);
-
-void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *);
-void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_write_points_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_fs_alloc_debug_to_text(struct printbuf *, struct bch_fs *);
-void bch2_dev_alloc_debug_to_text(struct printbuf *, struct bch_dev *);
-
-void bch2_print_allocator_stuck(struct bch_fs *);
-
-#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
deleted file mode 100644
index 9bbb28e90b93..000000000000
--- a/fs/bcachefs/alloc_types.h
+++ /dev/null
@@ -1,134 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ALLOC_TYPES_H
-#define _BCACHEFS_ALLOC_TYPES_H
-
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
-
-#include "clock_types.h"
-#include "fifo.h"
-
-struct bucket_alloc_state {
-	enum {
-		BTREE_BITMAP_NO,
-		BTREE_BITMAP_YES,
-		BTREE_BITMAP_ANY,
-	}	btree_bitmap;
-
-	u64	buckets_seen;
-	u64	skipped_open;
-	u64	skipped_need_journal_commit;
-	u64	skipped_nocow;
-	u64	skipped_nouse;
-	u64	skipped_mi_btree_bitmap;
-};
-
-#define BCH_WATERMARKS()		\
-	x(stripe)			\
-	x(normal)			\
-	x(copygc)			\
-	x(btree)			\
-	x(btree_copygc)			\
-	x(reclaim)			\
-	x(interior_updates)
-
-enum bch_watermark {
-#define x(name)	BCH_WATERMARK_##name,
-	BCH_WATERMARKS()
-#undef x
-	BCH_WATERMARK_NR,
-};
-
-#define BCH_WATERMARK_BITS	3
-#define BCH_WATERMARK_MASK	~(~0U << BCH_WATERMARK_BITS)
-
-#define OPEN_BUCKETS_COUNT	1024
-
-#define WRITE_POINT_HASH_NR	32
-#define WRITE_POINT_MAX		32
-
-/*
- * 0 is never a valid open_bucket_idx_t:
- */
-typedef u16			open_bucket_idx_t;
-
-struct open_bucket {
-	spinlock_t		lock;
-	atomic_t		pin;
-	open_bucket_idx_t	freelist;
-	open_bucket_idx_t	hash;
-
-	/*
-	 * When an open bucket has an ec_stripe attached, this is the index of
-	 * the block in the stripe this open_bucket corresponds to:
-	 */
-	u8			ec_idx;
-	enum bch_data_type	data_type:6;
-	unsigned		valid:1;
-	unsigned		on_partial_list:1;
-
-	u8			dev;
-	u8			gen;
-	u32			sectors_free;
-	u64			bucket;
-	struct ec_stripe_new	*ec;
-};
-
-#define OPEN_BUCKET_LIST_MAX	15
-
-struct open_buckets {
-	open_bucket_idx_t	nr;
-	open_bucket_idx_t	v[OPEN_BUCKET_LIST_MAX];
-};
-
-struct dev_stripe_state {
-	u64			next_alloc[BCH_SB_MEMBERS_MAX];
-};
-
-#define WRITE_POINT_STATES()		\
-	x(stopped)			\
-	x(waiting_io)			\
-	x(waiting_work)			\
-	x(running)
-
-enum write_point_state {
-#define x(n)	WRITE_POINT_##n,
-	WRITE_POINT_STATES()
-#undef x
-	WRITE_POINT_STATE_NR
-};
-
-struct write_point {
-	struct {
-		struct hlist_node	node;
-		struct mutex		lock;
-		u64			last_used;
-		unsigned long		write_point;
-		enum bch_data_type	data_type;
-
-		/* calculated based on how many pointers we're actually going to use: */
-		unsigned		sectors_free;
-
-		struct open_buckets	ptrs;
-		struct dev_stripe_state	stripe;
-
-		u64			sectors_allocated;
-	} __aligned(SMP_CACHE_BYTES);
-
-	struct {
-		struct work_struct	index_update_work;
-
-		struct list_head	writes;
-		spinlock_t		writes_lock;
-
-		enum write_point_state	state;
-		u64			last_state_change;
-		u64			time[WRITE_POINT_STATE_NR];
-	} __aligned(SMP_CACHE_BYTES);
-};
-
-struct write_point_specifier {
-	unsigned long		v;
-};
-
-#endif /* _BCACHEFS_ALLOC_TYPES_H */
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
deleted file mode 100644
index 692b1c7d5018..000000000000
--- a/fs/bcachefs/backpointers.c
+++ /dev/null
@@ -1,1010 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "bbpos.h"
-#include "alloc_background.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "checksum.h"
-#include "error.h"
-
-#include <linux/mm.h>
-
-static bool extent_matches_bp(struct bch_fs *c,
-			      enum btree_id btree_id, unsigned level,
-			      struct bkey_s_c k,
-			      struct bpos bucket,
-			      struct bch_backpointer bp)
-{
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	const union bch_extent_entry *entry;
-	struct extent_ptr_decoded p;
-
-	rcu_read_lock();
-	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-		struct bpos bucket2;
-		struct bch_backpointer bp2;
-
-		if (p.ptr.cached)
-			continue;
-
-		struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
-		if (!ca)
-			continue;
-
-		bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, &bucket2, &bp2);
-		if (bpos_eq(bucket, bucket2) &&
-		    !memcmp(&bp, &bp2, sizeof(bp))) {
-			rcu_read_unlock();
-			return true;
-		}
-	}
-	rcu_read_unlock();
-
-	return false;
-}
-
-int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k,
-			     enum bch_validate_flags flags,
-			     struct printbuf *err)
-{
-	struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
-
-	rcu_read_lock();
-	struct bch_dev *ca = bch2_dev_rcu(c, bp.k->p.inode);
-	if (!ca) {
-		/* these will be caught by fsck */
-		rcu_read_unlock();
-		return 0;
-	}
-
-	struct bpos bucket = bp_pos_to_bucket(ca, bp.k->p);
-	struct bpos bp_pos = bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset);
-	rcu_read_unlock();
-	int ret = 0;
-
-	bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size ||
-			 !bpos_eq(bp.k->p, bp_pos),
-			 c, err,
-			 backpointer_bucket_offset_wrong,
-			 "backpointer bucket_offset wrong");
-fsck_err:
-	return ret;
-}
-
-void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp)
-{
-	prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=",
-	       bch2_btree_id_str(bp->btree_id),
-	       bp->level,
-	       (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT),
-	       (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
-	       bp->bucket_len);
-	bch2_bpos_to_text(out, bp->pos);
-}
-
-void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
-	rcu_read_lock();
-	struct bch_dev *ca = bch2_dev_rcu(c, k.k->p.inode);
-	if (ca) {
-		struct bpos bucket = bp_pos_to_bucket(ca, k.k->p);
-		rcu_read_unlock();
-		prt_str(out, "bucket=");
-		bch2_bpos_to_text(out, bucket);
-		prt_str(out, " ");
-	} else {
-		rcu_read_unlock();
-	}
-
-	bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v);
-}
-
-void bch2_backpointer_swab(struct bkey_s k)
-{
-	struct bkey_s_backpointer bp = bkey_s_to_backpointer(k);
-
-	bp.v->bucket_offset	= swab40(bp.v->bucket_offset);
-	bp.v->bucket_len	= swab32(bp.v->bucket_len);
-	bch2_bpos_swab(&bp.v->pos);
-}
-
-static noinline int backpointer_mod_err(struct btree_trans *trans,
-					struct bch_backpointer bp,
-					struct bkey_s_c bp_k,
-					struct bkey_s_c orig_k,
-					bool insert)
-{
-	struct bch_fs *c = trans->c;
-	struct printbuf buf = PRINTBUF;
-
-	if (insert) {
-		prt_printf(&buf, "existing backpointer found when inserting ");
-		bch2_backpointer_to_text(&buf, &bp);
-		prt_newline(&buf);
-		printbuf_indent_add(&buf, 2);
-
-		prt_printf(&buf, "found ");
-		bch2_bkey_val_to_text(&buf, c, bp_k);
-		prt_newline(&buf);
-
-		prt_printf(&buf, "for ");
-		bch2_bkey_val_to_text(&buf, c, orig_k);
-
-		bch_err(c, "%s", buf.buf);
-	} else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
-		prt_printf(&buf, "backpointer not found when deleting\n");
-		printbuf_indent_add(&buf, 2);
-
-		prt_printf(&buf, "searching for ");
-		bch2_backpointer_to_text(&buf, &bp);
-		prt_newline(&buf);
-
-		prt_printf(&buf, "got ");
-		bch2_bkey_val_to_text(&buf, c, bp_k);
-		prt_newline(&buf);
-
-		prt_printf(&buf, "for ");
-		bch2_bkey_val_to_text(&buf, c, orig_k);
-
-		bch_err(c, "%s", buf.buf);
-	}
-
-	printbuf_exit(&buf);
-
-	if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
-		return bch2_inconsistent_error(c) ? BCH_ERR_erofs_unfixed_errors : 0;
-	} else {
-		return 0;
-	}
-}
-
-int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
-				struct bch_dev *ca,
-				struct bpos bucket,
-				struct bch_backpointer bp,
-				struct bkey_s_c orig_k,
-				bool insert)
-{
-	struct btree_iter bp_iter;
-	struct bkey_s_c k;
-	struct bkey_i_backpointer *bp_k;
-	int ret;
-
-	bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer));
-	ret = PTR_ERR_OR_ZERO(bp_k);
-	if (ret)
-		return ret;
-
-	bkey_backpointer_init(&bp_k->k_i);
-	bp_k->k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset);
-	bp_k->v = bp;
-
-	if (!insert) {
-		bp_k->k.type = KEY_TYPE_deleted;
-		set_bkey_val_u64s(&bp_k->k, 0);
-	}
-
-	k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
-			       bp_k->k.p,
-			       BTREE_ITER_intent|
-			       BTREE_ITER_slots|
-			       BTREE_ITER_with_updates);
-	ret = bkey_err(k);
-	if (ret)
-		goto err;
-
-	if (insert
-	    ? k.k->type
-	    : (k.k->type != KEY_TYPE_backpointer ||
-	       memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp)))) {
-		ret = backpointer_mod_err(trans, bp, k, orig_k, insert);
-		if (ret)
-			goto err;
-	}
-
-	ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0);
-err:
-	bch2_trans_iter_exit(trans, &bp_iter);
-	return ret;
-}
-
-/*
- * Find the next backpointer >= *bp_offset:
- */
-int bch2_get_next_backpointer(struct btree_trans *trans,
-			      struct bch_dev *ca,
-			      struct bpos bucket, int gen,
-			      struct bpos *bp_pos,
-			      struct bch_backpointer *bp,
-			      unsigned iter_flags)
-{
-	struct bpos bp_end_pos = bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0);
-	struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL };
-	struct bkey_s_c k;
-	int ret = 0;
-
-	if (bpos_ge(*bp_pos, bp_end_pos))
-		goto done;
-
-	if (gen >= 0) {
-		k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc,
-				       bucket, BTREE_ITER_cached|iter_flags);
-		ret = bkey_err(k);
-		if (ret)
-			goto out;
-
-		if (k.k->type != KEY_TYPE_alloc_v4 ||
-		    bkey_s_c_to_alloc_v4(k).v->gen != gen)
-			goto done;
-	}
-
-	*bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(ca, bucket, 0));
-
-	for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers,
-				     *bp_pos, iter_flags, k, ret) {
-		if (bpos_ge(k.k->p, bp_end_pos))
-			break;
-
-		*bp_pos = k.k->p;
-		*bp = *bkey_s_c_to_backpointer(k).v;
-		goto out;
-	}
-done:
-	*bp_pos = SPOS_MAX;
-out:
-	bch2_trans_iter_exit(trans, &bp_iter);
-	bch2_trans_iter_exit(trans, &alloc_iter);
-	return ret;
-}
-
-static void backpointer_not_found(struct btree_trans *trans,
-				  struct bpos bp_pos,
-				  struct bch_backpointer bp,
-				  struct bkey_s_c k)
-{
-	struct bch_fs *c = trans->c;
-	struct printbuf buf = PRINTBUF;
-
-	/*
-	 * If we're using the btree write buffer, the backpointer we were
-	 * looking at may have already been deleted - failure to find what it
-	 * pointed to is not an error:
-	 */
-	if (likely(!bch2_backpointers_no_use_write_buffer))
-		return;
-
-	struct bpos bucket;
-	if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket))
-		return;
-
-	prt_printf(&buf, "backpointer doesn't match %s it points to:\n  ",
-		   bp.level ? "btree node" : "extent");
-	prt_printf(&buf, "bucket: ");
-	bch2_bpos_to_text(&buf, bucket);
-	prt_printf(&buf, "\n  ");
-
-	prt_printf(&buf, "backpointer pos: ");
-	bch2_bpos_to_text(&buf, bp_pos);
-	prt_printf(&buf, "\n  ");
-
-	bch2_backpointer_to_text(&buf, &bp);
-	prt_printf(&buf, "\n  ");
-	bch2_bkey_val_to_text(&buf, c, k);
-	if (c->curr_recovery_pass >= BCH_RECOVERY_PASS_check_extents_to_backpointers)
-		bch_err_ratelimited(c, "%s", buf.buf);
-	else
-		bch2_trans_inconsistent(trans, "%s", buf.buf);
-
-	printbuf_exit(&buf);
-}
-
-struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
-					 struct btree_iter *iter,
-					 struct bpos bp_pos,
-					 struct bch_backpointer bp,
-					 unsigned iter_flags)
-{
-	if (likely(!bp.level)) {
-		struct bch_fs *c = trans->c;
-
-		struct bpos bucket;
-		if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket))
-			return bkey_s_c_err(-EIO);
-
-		bch2_trans_node_iter_init(trans, iter,
-					  bp.btree_id,
-					  bp.pos,
-					  0, 0,
-					  iter_flags);
-		struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
-		if (bkey_err(k)) {
-			bch2_trans_iter_exit(trans, iter);
-			return k;
-		}
-
-		if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp))
-			return k;
-
-		bch2_trans_iter_exit(trans, iter);
-		backpointer_not_found(trans, bp_pos, bp, k);
-		return bkey_s_c_null;
-	} else {
-		struct btree *b = bch2_backpointer_get_node(trans, iter, bp_pos, bp);
-
-		if (IS_ERR_OR_NULL(b)) {
-			bch2_trans_iter_exit(trans, iter);
-			return IS_ERR(b) ? bkey_s_c_err(PTR_ERR(b)) : bkey_s_c_null;
-		}
-		return bkey_i_to_s_c(&b->key);
-	}
-}
-
-struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
-					struct btree_iter *iter,
-					struct bpos bp_pos,
-					struct bch_backpointer bp)
-{
-	struct bch_fs *c = trans->c;
-
-	BUG_ON(!bp.level);
-
-	struct bpos bucket;
-	if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket))
-		return ERR_PTR(-EIO);
-
-	bch2_trans_node_iter_init(trans, iter,
-				  bp.btree_id,
-				  bp.pos,
-				  0,
-				  bp.level - 1,
-				  0);
-	struct btree *b = bch2_btree_iter_peek_node(iter);
-	if (IS_ERR_OR_NULL(b))
-		goto err;
-
-	BUG_ON(b->c.level != bp.level - 1);
-
-	if (extent_matches_bp(c, bp.btree_id, bp.level,
-			      bkey_i_to_s_c(&b->key),
-			      bucket, bp))
-		return b;
-
-	if (btree_node_will_make_reachable(b)) {
-		b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
-	} else {
-		backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key));
-		b = NULL;
-	}
-err:
-	bch2_trans_iter_exit(trans, iter);
-	return b;
-}
-
-static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter,
-					struct bkey_s_c k)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter alloc_iter = { NULL };
-	struct bkey_s_c alloc_k;
-	struct printbuf buf = PRINTBUF;
-	int ret = 0;
-
-	struct bpos bucket;
-	if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) {
-		if (fsck_err(c, backpointer_to_missing_device,
-			     "backpointer for missing device:\n%s",
-			     (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-			ret = bch2_btree_delete_at(trans, bp_iter, 0);
-		goto out;
-	}
-
-	alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, 0);
-	ret = bkey_err(alloc_k);
-	if (ret)
-		goto out;
-
-	if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c,
-			backpointer_to_missing_alloc,
-			"backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
-			alloc_iter.pos.inode, alloc_iter.pos.offset,
-			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-		ret = bch2_btree_delete_at(trans, bp_iter, 0);
-		goto out;
-	}
-out:
-fsck_err:
-	bch2_trans_iter_exit(trans, &alloc_iter);
-	printbuf_exit(&buf);
-	return ret;
-}
-
-/* verify that every backpointer has a corresponding alloc key */
-int bch2_check_btree_backpointers(struct bch_fs *c)
-{
-	int ret = bch2_trans_run(c,
-		for_each_btree_key_commit(trans, iter,
-			BTREE_ID_backpointers, POS_MIN, 0, k,
-			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-		  bch2_check_btree_backpointer(trans, &iter, k)));
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r)
-{
-	return bpos_eq(l.k->p, r.k->p) &&
-		bkey_bytes(l.k) == bkey_bytes(r.k) &&
-		!memcmp(l.v, r.v, bkey_val_bytes(l.k));
-}
-
-struct extents_to_bp_state {
-	struct bpos	bucket_start;
-	struct bpos	bucket_end;
-	struct bkey_buf last_flushed;
-};
-
-static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree,
-			       struct bkey_s_c extent, unsigned dev)
-{
-	struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent);
-	int ret = PTR_ERR_OR_ZERO(n);
-	if (ret)
-		return ret;
-
-	bch2_bkey_drop_device(bkey_i_to_s(n), dev);
-	return bch2_btree_insert_trans(trans, btree, n, 0);
-}
-
-static int check_extent_checksum(struct btree_trans *trans,
-				 enum btree_id btree, struct bkey_s_c extent,
-				 enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent);
-	const union bch_extent_entry *entry;
-	struct extent_ptr_decoded p;
-	struct printbuf buf = PRINTBUF;
-	void *data_buf = NULL;
-	struct bio *bio = NULL;
-	size_t bytes;
-	int ret = 0;
-
-	if (bkey_is_btree_ptr(extent.k))
-		return false;
-
-	bkey_for_each_ptr_decode(extent.k, ptrs, p, entry)
-		if (p.ptr.dev == dev)
-			goto found;
-	BUG();
-found:
-	if (!p.crc.csum_type)
-		return false;
-
-	bytes = p.crc.compressed_size << 9;
-
-	struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ);
-	if (!ca)
-		return false;
-
-	data_buf = kvmalloc(bytes, GFP_KERNEL);
-	if (!data_buf) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
-	bio = bio_alloc(ca->disk_sb.bdev, buf_pages(data_buf, bytes), REQ_OP_READ, GFP_KERNEL);
-	bio->bi_iter.bi_sector = p.ptr.offset;
-	bch2_bio_map(bio, data_buf, bytes);
-	ret = submit_bio_wait(bio);
-	if (ret)
-		goto err;
-
-	prt_str(&buf, "extents pointing to same space, but first extent checksum bad:");
-	prt_printf(&buf, "\n  %s ", bch2_btree_id_str(btree));
-	bch2_bkey_val_to_text(&buf, c, extent);
-	prt_printf(&buf, "\n  %s ", bch2_btree_id_str(o_btree));
-	bch2_bkey_val_to_text(&buf, c, extent2);
-
-	struct nonce nonce = extent_nonce(extent.k->version, p.crc);
-	struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes);
-	if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum),
-			c, dup_backpointer_to_bad_csum_extent,
-			"%s", buf.buf))
-		ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1;
-fsck_err:
-err:
-	if (bio)
-		bio_put(bio);
-	kvfree(data_buf);
-	percpu_ref_put(&ca->io_ref);
-	printbuf_exit(&buf);
-	return ret;
-}
-
-static int check_bp_exists(struct btree_trans *trans,
-			   struct extents_to_bp_state *s,
-			   struct bpos bucket,
-			   struct bch_backpointer bp,
-			   struct bkey_s_c orig_k)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter bp_iter = {};
-	struct btree_iter other_extent_iter = {};
-	struct printbuf buf = PRINTBUF;
-	struct bkey_s_c bp_k;
-	struct bkey_buf tmp;
-	int ret = 0;
-
-	bch2_bkey_buf_init(&tmp);
-
-	struct bch_dev *ca = bch2_dev_bucket_tryget(c, bucket);
-	if (!ca) {
-		prt_str(&buf, "extent for nonexistent device:bucket ");
-		bch2_bpos_to_text(&buf, bucket);
-		prt_str(&buf, "\n  ");
-		bch2_bkey_val_to_text(&buf, c, orig_k);
-		bch_err(c, "%s", buf.buf);
-		ret = -BCH_ERR_fsck_repair_unimplemented;
-		goto err;
-	}
-
-	if (bpos_lt(bucket, s->bucket_start) ||
-	    bpos_gt(bucket, s->bucket_end))
-		goto out;
-
-	bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
-				  bucket_pos_to_bp(ca, bucket, bp.bucket_offset),
-				  0);
-	ret = bkey_err(bp_k);
-	if (ret)
-		goto err;
-
-	if (bp_k.k->type != KEY_TYPE_backpointer ||
-	    memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) {
-		bch2_bkey_buf_reassemble(&tmp, c, orig_k);
-
-		if (!bkey_and_val_eq(orig_k, bkey_i_to_s_c(s->last_flushed.k))) {
-			if (bp.level) {
-				bch2_trans_unlock(trans);
-				bch2_btree_interior_updates_flush(c);
-			}
-
-			ret = bch2_btree_write_buffer_flush_sync(trans);
-			if (ret)
-				goto err;
-
-			bch2_bkey_buf_copy(&s->last_flushed, c, tmp.k);
-			ret = -BCH_ERR_transaction_restart_write_buffer_flush;
-			goto out;
-		}
-
-		goto check_existing_bp;
-	}
-out:
-err:
-fsck_err:
-	bch2_trans_iter_exit(trans, &other_extent_iter);
-	bch2_trans_iter_exit(trans, &bp_iter);
-	bch2_bkey_buf_exit(&tmp, c);
-	bch2_dev_put(ca);
-	printbuf_exit(&buf);
-	return ret;
-check_existing_bp:
-	/* Do we have a backpointer for a different extent? */
-	if (bp_k.k->type != KEY_TYPE_backpointer)
-		goto missing;
-
-	struct bch_backpointer other_bp = *bkey_s_c_to_backpointer(bp_k).v;
-
-	struct bkey_s_c other_extent =
-		bch2_backpointer_get_key(trans, &other_extent_iter, bp_k.k->p, other_bp, 0);
-	ret = bkey_err(other_extent);
-	if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
-		ret = 0;
-	if (ret)
-		goto err;
-
-	if (!other_extent.k)
-		goto missing;
-
-	if (bch2_extents_match(orig_k, other_extent)) {
-		printbuf_reset(&buf);
-		prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n  ");
-		bch2_bkey_val_to_text(&buf, c, orig_k);
-		prt_str(&buf, "\n  ");
-		bch2_bkey_val_to_text(&buf, c, other_extent);
-		bch_err(c, "%s", buf.buf);
-
-		if (other_extent.k->size <= orig_k.k->size) {
-			ret = drop_dev_and_update(trans, other_bp.btree_id, other_extent, bucket.inode);
-			if (ret)
-				goto err;
-			goto out;
-		} else {
-			ret = drop_dev_and_update(trans, bp.btree_id, orig_k, bucket.inode);
-			if (ret)
-				goto err;
-			goto missing;
-		}
-	}
-
-	ret = check_extent_checksum(trans, other_bp.btree_id, other_extent, bp.btree_id, orig_k, bucket.inode);
-	if (ret < 0)
-		goto err;
-	if (ret) {
-		ret = 0;
-		goto missing;
-	}
-
-	ret = check_extent_checksum(trans, bp.btree_id, orig_k, other_bp.btree_id, other_extent, bucket.inode);
-	if (ret < 0)
-		goto err;
-	if (ret) {
-		ret = 0;
-		goto out;
-	}
-
-	printbuf_reset(&buf);
-	prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n  ", bucket.inode);
-	bch2_bkey_val_to_text(&buf, c, orig_k);
-	prt_str(&buf, "\n  ");
-	bch2_bkey_val_to_text(&buf, c, other_extent);
-	bch_err(c, "%s", buf.buf);
-	ret = -BCH_ERR_fsck_repair_unimplemented;
-	goto err;
-missing:
-	printbuf_reset(&buf);
-	prt_printf(&buf, "missing backpointer for btree=%s l=%u ",
-	       bch2_btree_id_str(bp.btree_id), bp.level);
-	bch2_bkey_val_to_text(&buf, c, orig_k);
-	prt_printf(&buf, "\n  got:   ");
-	bch2_bkey_val_to_text(&buf, c, bp_k);
-
-	struct bkey_i_backpointer n_bp_k;
-	bkey_backpointer_init(&n_bp_k.k_i);
-	n_bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset);
-	n_bp_k.v = bp;
-	prt_printf(&buf, "\n  want:  ");
-	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i));
-
-	if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
-		ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, orig_k, true);
-
-	goto out;
-}
-
-static int check_extent_to_backpointers(struct btree_trans *trans,
-					struct extents_to_bp_state *s,
-					enum btree_id btree, unsigned level,
-					struct bkey_s_c k)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_ptrs_c ptrs;
-	const union bch_extent_entry *entry;
-	struct extent_ptr_decoded p;
-	int ret;
-
-	ptrs = bch2_bkey_ptrs_c(k);
-	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-		struct bpos bucket_pos;
-		struct bch_backpointer bp;
-
-		if (p.ptr.cached)
-			continue;
-
-		rcu_read_lock();
-		struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
-		if (ca)
-			bch2_extent_ptr_to_bp(c, ca, btree, level, k, p, entry, &bucket_pos, &bp);
-		rcu_read_unlock();
-
-		if (!ca)
-			continue;
-
-		ret = check_bp_exists(trans, s, bucket_pos, bp, k);
-		if (ret)
-			return ret;
-	}
-
-	return 0;
-}
-
-static int check_btree_root_to_backpointers(struct btree_trans *trans,
-					    struct extents_to_bp_state *s,
-					    enum btree_id btree_id,
-					    int *level)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter iter;
-	struct btree *b;
-	struct bkey_s_c k;
-	int ret;
-retry:
-	bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN,
-				  0, bch2_btree_id_root(c, btree_id)->b->c.level, 0);
-	b = bch2_btree_iter_peek_node(&iter);
-	ret = PTR_ERR_OR_ZERO(b);
-	if (ret)
-		goto err;
-
-	if (b != btree_node_root(c, b)) {
-		bch2_trans_iter_exit(trans, &iter);
-		goto retry;
-	}
-
-	*level = b->c.level;
-
-	k = bkey_i_to_s_c(&b->key);
-	ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k);
-err:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
-{
-	return (struct bbpos) {
-		.btree	= bp.btree_id,
-		.pos	= bp.pos,
-	};
-}
-
-static u64 mem_may_pin_bytes(struct bch_fs *c)
-{
-	struct sysinfo i;
-	si_meminfo(&i);
-
-	u64 mem_bytes = i.totalram * i.mem_unit;
-	return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100);
-}
-
-static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
-{
-	return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size);
-}
-
-static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
-					u64 btree_leaf_mask,
-					u64 btree_interior_mask,
-					struct bbpos start, struct bbpos *end)
-{
-	struct bch_fs *c = trans->c;
-	s64 mem_may_pin = mem_may_pin_bytes(c);
-	int ret = 0;
-
-	btree_interior_mask |= btree_leaf_mask;
-
-	c->btree_cache.pinned_nodes_leaf_mask		= btree_leaf_mask;
-	c->btree_cache.pinned_nodes_interior_mask	= btree_interior_mask;
-	c->btree_cache.pinned_nodes_start		= start;
-	c->btree_cache.pinned_nodes_end			= *end = BBPOS_MAX;
-
-	for (enum btree_id btree = start.btree;
-	     btree < BTREE_ID_NR && !ret;
-	     btree++) {
-		unsigned depth = ((1U << btree) & btree_leaf_mask) ? 0 : 1;
-		struct btree_iter iter;
-		struct btree *b;
-
-		if (!((1U << btree) & btree_leaf_mask) &&
-		    !((1U << btree) & btree_interior_mask))
-			continue;
-
-		__for_each_btree_node(trans, iter, btree,
-				      btree == start.btree ? start.pos : POS_MIN,
-				      0, depth, BTREE_ITER_prefetch, b, ret) {
-			mem_may_pin -= btree_buf_bytes(b);
-			if (mem_may_pin <= 0) {
-				c->btree_cache.pinned_nodes_end = *end =
-					BBPOS(btree, b->key.k.p);
-				bch2_trans_iter_exit(trans, &iter);
-				return 0;
-			}
-		}
-		bch2_trans_iter_exit(trans, &iter);
-	}
-
-	return ret;
-}
-
-static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
-						   struct extents_to_bp_state *s)
-{
-	struct bch_fs *c = trans->c;
-	int ret = 0;
-
-	for (enum btree_id btree_id = 0;
-	     btree_id < btree_id_nr_alive(c);
-	     btree_id++) {
-		int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
-
-		ret = commit_do(trans, NULL, NULL,
-				BCH_TRANS_COMMIT_no_enospc,
-				check_btree_root_to_backpointers(trans, s, btree_id, &level));
-		if (ret)
-			return ret;
-
-		while (level >= depth) {
-			struct btree_iter iter;
-			bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, level,
-						  BTREE_ITER_prefetch);
-
-			ret = for_each_btree_key_continue(trans, iter, 0, k, ({
-				check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
-				bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-			}));
-			if (ret)
-				return ret;
-
-			--level;
-		}
-	}
-
-	return 0;
-}
-
-int bch2_check_extents_to_backpointers(struct bch_fs *c)
-{
-	struct btree_trans *trans = bch2_trans_get(c);
-	struct extents_to_bp_state s = { .bucket_start = POS_MIN };
-	int ret;
-
-	bch2_bkey_buf_init(&s.last_flushed);
-	bkey_init(&s.last_flushed.k->k);
-
-	while (1) {
-		struct bbpos end;
-		ret = bch2_get_btree_in_memory_pos(trans,
-				BIT_ULL(BTREE_ID_backpointers),
-				BIT_ULL(BTREE_ID_backpointers),
-				BBPOS(BTREE_ID_backpointers, s.bucket_start), &end);
-		if (ret)
-			break;
-
-		s.bucket_end = end.pos;
-
-		if ( bpos_eq(s.bucket_start, POS_MIN) &&
-		    !bpos_eq(s.bucket_end, SPOS_MAX))
-			bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
-				    __func__, btree_nodes_fit_in_ram(c));
-
-		if (!bpos_eq(s.bucket_start, POS_MIN) ||
-		    !bpos_eq(s.bucket_end, SPOS_MAX)) {
-			struct printbuf buf = PRINTBUF;
-
-			prt_str(&buf, "check_extents_to_backpointers(): ");
-			bch2_bpos_to_text(&buf, s.bucket_start);
-			prt_str(&buf, "-");
-			bch2_bpos_to_text(&buf, s.bucket_end);
-
-			bch_verbose(c, "%s", buf.buf);
-			printbuf_exit(&buf);
-		}
-
-		ret = bch2_check_extents_to_backpointers_pass(trans, &s);
-		if (ret || bpos_eq(s.bucket_end, SPOS_MAX))
-			break;
-
-		s.bucket_start = bpos_successor(s.bucket_end);
-	}
-	bch2_trans_put(trans);
-	bch2_bkey_buf_exit(&s.last_flushed, c);
-
-	c->btree_cache.pinned_nodes_leaf_mask = 0;
-	c->btree_cache.pinned_nodes_interior_mask = 0;
-
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-static int check_one_backpointer(struct btree_trans *trans,
-				 struct bbpos start,
-				 struct bbpos end,
-				 struct bkey_s_c_backpointer bp,
-				 struct bpos *last_flushed_pos)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter iter;
-	struct bbpos pos = bp_to_bbpos(*bp.v);
-	struct bkey_s_c k;
-	struct printbuf buf = PRINTBUF;
-	int ret;
-
-	if (bbpos_cmp(pos, start) < 0 ||
-	    bbpos_cmp(pos, end) > 0)
-		return 0;
-
-	k = bch2_backpointer_get_key(trans, &iter, bp.k->p, *bp.v, 0);
-	ret = bkey_err(k);
-	if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
-		return 0;
-	if (ret)
-		return ret;
-
-	if (!k.k && !bpos_eq(*last_flushed_pos, bp.k->p)) {
-		*last_flushed_pos = bp.k->p;
-		ret = bch2_btree_write_buffer_flush_sync(trans) ?:
-			-BCH_ERR_transaction_restart_write_buffer_flush;
-		goto out;
-	}
-
-	if (fsck_err_on(!k.k, c,
-			backpointer_to_missing_ptr,
-			"backpointer for missing %s\n  %s",
-			bp.v->level ? "btree node" : "extent",
-			(bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) {
-		ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p);
-		goto out;
-	}
-out:
-fsck_err:
-	bch2_trans_iter_exit(trans, &iter);
-	printbuf_exit(&buf);
-	return ret;
-}
-
-static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
-						   struct bbpos start,
-						   struct bbpos end)
-{
-	struct bpos last_flushed_pos = SPOS_MAX;
-
-	return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
-				  POS_MIN, BTREE_ITER_prefetch, k,
-				  NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-		check_one_backpointer(trans, start, end,
-				      bkey_s_c_to_backpointer(k),
-				      &last_flushed_pos));
-}
-
-int bch2_check_backpointers_to_extents(struct bch_fs *c)
-{
-	struct btree_trans *trans = bch2_trans_get(c);
-	struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end;
-	int ret;
-
-	while (1) {
-		ret = bch2_get_btree_in_memory_pos(trans,
-						   (1U << BTREE_ID_extents)|
-						   (1U << BTREE_ID_reflink),
-						   ~0,
-						   start, &end);
-		if (ret)
-			break;
-
-		if (!bbpos_cmp(start, BBPOS_MIN) &&
-		    bbpos_cmp(end, BBPOS_MAX))
-			bch_verbose(c, "%s(): extents do not fit in ram, running in multiple passes with %zu nodes per pass",
-				    __func__, btree_nodes_fit_in_ram(c));
-
-		if (bbpos_cmp(start, BBPOS_MIN) ||
-		    bbpos_cmp(end, BBPOS_MAX)) {
-			struct printbuf buf = PRINTBUF;
-
-			prt_str(&buf, "check_backpointers_to_extents(): ");
-			bch2_bbpos_to_text(&buf, start);
-			prt_str(&buf, "-");
-			bch2_bbpos_to_text(&buf, end);
-
-			bch_verbose(c, "%s", buf.buf);
-			printbuf_exit(&buf);
-		}
-
-		ret = bch2_check_backpointers_to_extents_pass(trans, start, end);
-		if (ret || !bbpos_cmp(end, BBPOS_MAX))
-			break;
-
-		start = bbpos_successor(end);
-	}
-	bch2_trans_put(trans);
-
-	c->btree_cache.pinned_nodes_leaf_mask = 0;
-	c->btree_cache.pinned_nodes_interior_mask = 0;
-
-	bch_err_fn(c, ret);
-	return ret;
-}
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
deleted file mode 100644
index 6021de1c5e98..000000000000
--- a/fs/bcachefs/backpointers.h
+++ /dev/null
@@ -1,172 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H
-#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H
-
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "error.h"
-#include "super.h"
-
-static inline u64 swab40(u64 x)
-{
-	return (((x & 0x00000000ffULL) << 32)|
-		((x & 0x000000ff00ULL) << 16)|
-		((x & 0x0000ff0000ULL) >>  0)|
-		((x & 0x00ff000000ULL) >> 16)|
-		((x & 0xff00000000ULL) >> 32));
-}
-
-int bch2_backpointer_invalid(struct bch_fs *, struct bkey_s_c k,
-			     enum bch_validate_flags, struct printbuf *);
-void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *);
-void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-void bch2_backpointer_swab(struct bkey_s);
-
-#define bch2_bkey_ops_backpointer ((struct bkey_ops) {	\
-	.key_invalid	= bch2_backpointer_invalid,	\
-	.val_to_text	= bch2_backpointer_k_to_text,	\
-	.swab		= bch2_backpointer_swab,	\
-	.min_val_size	= 32,				\
-})
-
-#define MAX_EXTENT_COMPRESS_RATIO_SHIFT		10
-
-/*
- * Convert from pos in backpointer btree to pos of corresponding bucket in alloc
- * btree:
- */
-static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos bp_pos)
-{
-	u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
-
-	return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector));
-}
-
-static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket)
-{
-	rcu_read_lock();
-	struct bch_dev *ca = bch2_dev_rcu(c, bp_pos.inode);
-	if (ca)
-		*bucket = bp_pos_to_bucket(ca, bp_pos);
-	rcu_read_unlock();
-	return ca != NULL;
-}
-
-static inline bool bp_pos_to_bucket_nodev(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket)
-{
-	return !bch2_fs_inconsistent_on(!bp_pos_to_bucket_nodev_noerror(c, bp_pos, bucket),
-					c, "backpointer for missing device %llu", bp_pos.inode);
-}
-
-static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca,
-						   struct bpos bucket,
-						   u64 bucket_offset)
-{
-	return POS(bucket.inode,
-		   (bucket_to_sector(ca, bucket.offset) <<
-		    MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
-}
-
-/*
- * Convert from pos in alloc btree + bucket offset to pos in backpointer btree:
- */
-static inline struct bpos bucket_pos_to_bp(const struct bch_dev *ca,
-					   struct bpos bucket,
-					   u64 bucket_offset)
-{
-	struct bpos ret = bucket_pos_to_bp_noerror(ca, bucket, bucket_offset);
-	EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(ca, ret)));
-	return ret;
-}
-
-int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bch_dev *,
-				struct bpos bucket, struct bch_backpointer, struct bkey_s_c, bool);
-
-static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
-				struct bch_dev *ca,
-				struct bpos bucket,
-				struct bch_backpointer bp,
-				struct bkey_s_c orig_k,
-				bool insert)
-{
-	if (unlikely(bch2_backpointers_no_use_write_buffer))
-		return bch2_bucket_backpointer_mod_nowritebuffer(trans, ca, bucket, bp, orig_k, insert);
-
-	struct bkey_i_backpointer bp_k;
-
-	bkey_backpointer_init(&bp_k.k_i);
-	bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset);
-	bp_k.v = bp;
-
-	if (!insert) {
-		bp_k.k.type = KEY_TYPE_deleted;
-		set_bkey_val_u64s(&bp_k.k, 0);
-	}
-
-	return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i);
-}
-
-static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k,
-							 struct extent_ptr_decoded p,
-							 const union bch_extent_entry *entry)
-{
-	switch (k.k->type) {
-	case KEY_TYPE_btree_ptr:
-	case KEY_TYPE_btree_ptr_v2:
-		return BCH_DATA_btree;
-	case KEY_TYPE_extent:
-	case KEY_TYPE_reflink_v:
-		return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user;
-	case KEY_TYPE_stripe: {
-		const struct bch_extent_ptr *ptr = &entry->ptr;
-		struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-
-		BUG_ON(ptr < s.v->ptrs ||
-		       ptr >= s.v->ptrs + s.v->nr_blocks);
-
-		return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant
-			? BCH_DATA_parity
-			: BCH_DATA_user;
-	}
-	default:
-		BUG();
-	}
-}
-
-static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca,
-			   enum btree_id btree_id, unsigned level,
-			   struct bkey_s_c k, struct extent_ptr_decoded p,
-			   const union bch_extent_entry *entry,
-			   struct bpos *bucket_pos, struct bch_backpointer *bp)
-{
-	enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);
-	s64 sectors = level ? btree_sectors(c) : k.k->size;
-	u32 bucket_offset;
-
-	*bucket_pos = PTR_BUCKET_POS_OFFSET(ca, &p.ptr, &bucket_offset);
-	*bp = (struct bch_backpointer) {
-		.btree_id	= btree_id,
-		.level		= level,
-		.data_type	= data_type,
-		.bucket_offset	= ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) +
-			p.crc.offset,
-		.bucket_len	= ptr_disk_sectors(sectors, p),
-		.pos		= k.k->p,
-	};
-}
-
-int bch2_get_next_backpointer(struct btree_trans *, struct bch_dev *ca, struct bpos, int,
-			      struct bpos *, struct bch_backpointer *, unsigned);
-struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *,
-					 struct bpos, struct bch_backpointer,
-					 unsigned);
-struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *,
-					struct bpos, struct bch_backpointer);
-
-int bch2_check_btree_backpointers(struct bch_fs *);
-int bch2_check_extents_to_backpointers(struct bch_fs *);
-int bch2_check_backpointers_to_extents(struct bch_fs *);
-
-#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */
diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h
deleted file mode 100644
index be2edced5213..000000000000
--- a/fs/bcachefs/bbpos.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BBPOS_H
-#define _BCACHEFS_BBPOS_H
-
-#include "bbpos_types.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-
-static inline int bbpos_cmp(struct bbpos l, struct bbpos r)
-{
-	return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos);
-}
-
-static inline struct bbpos bbpos_successor(struct bbpos pos)
-{
-	if (bpos_cmp(pos.pos, SPOS_MAX)) {	/* nonzero compare: pos.pos differs from SPOS_MAX (assuming usual cmp semantics) */
-		pos.pos = bpos_successor(pos.pos);	/* advance within the same btree id */
-		return pos;
-	}
-
-	if (pos.btree != BTREE_ID_NR) {	/* otherwise move on to the next btree id, restarting at POS_MIN */
-		pos.btree++;
-		pos.pos = POS_MIN;
-		return pos;
-	}
-
-	BUG();	/* reached only when neither branch above returned: no successor is produced */
-}
-
-static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos)
-{
-	prt_str(out, bch2_btree_id_str(pos.btree));
-	prt_char(out, ':');
-	bch2_bpos_to_text(out, pos.pos);
-}
-
-#endif /* _BCACHEFS_BBPOS_H */
diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h
deleted file mode 100644
index f63893344f80..000000000000
--- a/fs/bcachefs/bbpos_types.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BBPOS_TYPES_H
-#define _BCACHEFS_BBPOS_TYPES_H
-
-struct bbpos {
-	enum btree_id		btree;
-	struct bpos		pos;
-};
-
-static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
-{
-	return (struct bbpos) { btree, pos };
-}
-
-#define BBPOS_MIN	BBPOS(0, POS_MIN)
-#define BBPOS_MAX	BBPOS(BTREE_ID_NR - 1, SPOS_MAX)
-
-#endif /* _BCACHEFS_BBPOS_TYPES_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
deleted file mode 100644
index bc0ea2c4efef..000000000000
--- a/fs/bcachefs/bcachefs.h
+++ /dev/null
@@ -1,1282 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_H
-#define _BCACHEFS_H
-
-/*
- * SOME HIGH LEVEL CODE DOCUMENTATION:
- *
- * Bcache mostly works with cache sets, cache devices, and backing devices.
- *
- * Support for multiple cache devices hasn't quite been finished off yet, but
- * it's about 95% plumbed through. A cache set and its cache devices is sort of
- * like a md raid array and its component devices. Most of the code doesn't care
- * about individual cache devices, the main abstraction is the cache set.
- *
- * Multiple cache devices is intended to give us the ability to mirror dirty
- * cached data and metadata, without mirroring clean cached data.
- *
- * Backing devices are different, in that they have a lifetime independent of a
- * cache set. When you register a newly formatted backing device it'll come up
- * in passthrough mode, and then you can attach and detach a backing device from
- * a cache set at runtime - while it's mounted and in use. Detaching implicitly
- * invalidates any cached data for that backing device.
- *
- * A cache set can have multiple (many) backing devices attached to it.
- *
- * There's also flash only volumes - this is the reason for the distinction
- * between struct cached_dev and struct bcache_device. A flash only volume
- * works much like a bcache device that has a backing device, except the
- * "cached" data is always dirty. The end result is that we get thin
- * provisioning with very little additional code.
- *
- * Flash only volumes work but they're not production ready because the moving
- * garbage collector needs more work. More on that later.
- *
- * BUCKETS/ALLOCATION:
- *
- * Bcache is primarily designed for caching, which means that in normal
- * operation all of our available space will be allocated. Thus, we need an
- * efficient way of deleting things from the cache so we can write new things to
- * it.
- *
- * To do this, we first divide the cache device up into buckets. A bucket is the
- * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
- * works efficiently.
- *
- * Each bucket has a 16 bit priority, and an 8 bit generation associated with
- * it. The gens and priorities for all the buckets are stored contiguously and
- * packed on disk (in a linked list of buckets - aside from the superblock, all
- * of bcache's metadata is stored in buckets).
- *
- * The priority is used to implement an LRU. We reset a bucket's priority when
- * we allocate it or on cache hit, and every so often we decrement the priority
- * of each bucket. It could be used to implement something more sophisticated,
- * if anyone ever gets around to it.
- *
- * The generation is used for invalidating buckets. Each pointer also has an 8
- * bit generation embedded in it; for a pointer to be considered valid, its gen
- * must match the gen of the bucket it points into.  Thus, to reuse a bucket all
- * we have to do is increment its gen (and write its new gen to disk; we batch
- * this up).
- *
- * Bcache is entirely COW - we never write twice to a bucket, even buckets that
- * contain metadata (including btree nodes).
- *
- * THE BTREE:
- *
- * Bcache is in large part designed around the btree.
- *
- * At a high level, the btree is just an index of key -> ptr tuples.
- *
- * Keys represent extents, and thus have a size field. Keys also have a variable
- * number of pointers attached to them (potentially zero, which is handy for
- * invalidating the cache).
- *
- * The key itself is an inode:offset pair. The inode number corresponds to a
- * backing device or a flash only volume. The offset is the ending offset of the
- * extent within the inode - not the starting offset; this makes lookups
- * slightly more convenient.
- *
- * Pointers contain the cache device id, the offset on that device, and an 8 bit
- * generation number. More on the gen later.
- *
- * Index lookups are not fully abstracted - cache lookups in particular are
- * still somewhat mixed in with the btree code, but things are headed in that
- * direction.
- *
- * Updates are fairly well abstracted, though. There are two different ways of
- * updating the btree; insert and replace.
- *
- * BTREE_INSERT will just take a list of keys and insert them into the btree -
- * overwriting (possibly only partially) any extents they overlap with. This is
- * used to update the index after a write.
- *
- * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
- * overwriting a key that matches another given key. This is used for inserting
- * data into the cache after a cache miss, and for background writeback, and for
- * the moving garbage collector.
- *
- * There is no "delete" operation; deleting things from the index is
- * accomplished either by invalidating pointers (by incrementing a bucket's
- * gen) or by inserting a key with 0 pointers - which will overwrite anything
- * previously present at that location in the index.
- *
- * This means that there are always stale/invalid keys in the btree. They're
- * filtered out by the code that iterates through a btree node, and removed when
- * a btree node is rewritten.
- *
- * BTREE NODES:
- *
- * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
- * free smaller than a bucket - so, that's how big our btree nodes are.
- *
- * (If buckets are really big we'll only use part of the bucket for a btree node
- * - no less than 1/4th - but a bucket still contains no more than a single
- * btree node. I'd actually like to change this, but for now we rely on the
- * bucket's gen for deleting btree nodes when we rewrite/split a node.)
- *
- * Anyways, btree nodes are big - big enough to be inefficient with a textbook
- * btree implementation.
- *
- * The way this is solved is that btree nodes are internally log structured; we
- * can append new keys to an existing btree node without rewriting it. This
- * means each set of keys we write is sorted, but the node is not.
- *
- * We maintain this log structure in memory - keeping 1Mb of keys sorted would
- * be expensive, and we have to distinguish between the keys we have written and
- * the keys we haven't. So to do a lookup in a btree node, we have to search
- * each sorted set. But we do merge written sets together lazily, so the cost of
- * these extra searches is quite low (normally most of the keys in a btree node
- * will be in one big set, and then there'll be one or two sets that are much
- * smaller).
- *
- * This log structure makes bcache's btree more of a hybrid between a
- * conventional btree and a compacting data structure, with some of the
- * advantages of both.
- *
- * GARBAGE COLLECTION:
- *
- * We can't just invalidate any bucket - it might contain dirty data or
- * metadata. If it once contained dirty data, other writes might overwrite it
- * later, leaving no valid pointers into that bucket in the index.
- *
- * Thus, the primary purpose of garbage collection is to find buckets to reuse.
- * It also counts how much valid data each bucket currently contains, so that
- * allocation can reuse buckets sooner when they've been mostly overwritten.
- *
- * It also does some things that are really internal to the btree
- * implementation. If a btree node contains pointers that are stale by more than
- * some threshold, it rewrites the btree node to avoid the bucket's generation
- * wrapping around. It also merges adjacent btree nodes if they're empty enough.
- *
- * THE JOURNAL:
- *
- * Bcache's journal is not necessary for consistency; we always strictly
- * order metadata writes so that the btree and everything else is consistent on
- * disk in the event of an unclean shutdown, and in fact bcache had writeback
- * caching (with recovery from unclean shutdown) before journalling was
- * implemented.
- *
- * Rather, the journal is purely a performance optimization; we can't complete a
- * write until we've updated the index on disk, otherwise the cache would be
- * inconsistent in the event of an unclean shutdown. This means that without the
- * journal, on random write workloads we constantly have to update all the leaf
- * nodes in the btree, and those writes will be mostly empty (appending at most
- * a few keys each) - highly inefficient in terms of amount of metadata writes,
- * and it puts more strain on the various btree resorting/compacting code.
- *
- * The journal is just a log of keys we've inserted; on startup we just reinsert
- * all the keys in the open journal entries. That means that when we're updating
- * a node in the btree, we can wait until a 4k block of keys fills up before
- * writing them out.
- *
- * For simplicity, we only journal updates to leaf nodes; updates to parent
- * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth
- * the complexity to deal with journalling them (in particular, journal replay)
- * - updates to non leaf nodes just happen synchronously (see btree_split()).
- */
-
-#undef pr_fmt
-#ifdef __KERNEL__
-#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__
-#else
-#define pr_fmt(fmt) "%s() " fmt "\n", __func__
-#endif
-
-#include <linux/backing-dev-defs.h>
-#include <linux/bug.h>
-#include <linux/bio.h>
-#include <linux/closure.h>
-#include <linux/kobject.h>
-#include <linux/list.h>
-#include <linux/math64.h>
-#include <linux/mutex.h>
-#include <linux/percpu-refcount.h>
-#include <linux/percpu-rwsem.h>
-#include <linux/refcount.h>
-#include <linux/rhashtable.h>
-#include <linux/rwsem.h>
-#include <linux/semaphore.h>
-#include <linux/seqlock.h>
-#include <linux/shrinker.h>
-#include <linux/srcu.h>
-#include <linux/types.h>
-#include <linux/workqueue.h>
-#include <linux/zstd.h>
-
-#include "bcachefs_format.h"
-#include "errcode.h"
-#include "fifo.h"
-#include "nocow_locking_types.h"
-#include "opts.h"
-#include "recovery_passes_types.h"
-#include "sb-errors_types.h"
-#include "seqmutex.h"
-#include "time_stats.h"
-#include "util.h"
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define BCH_WRITE_REF_DEBUG
-#endif
-
-#ifndef dynamic_fault
-#define dynamic_fault(...)		0
-#endif
-
-#define race_fault(...)			dynamic_fault("bcachefs:race")
-
-#define count_event(_c, _name)	this_cpu_inc((_c)->counters[BCH_COUNTER_##_name])
-
-#define trace_and_count(_c, _name, ...)					\
-do {									\
-	count_event(_c, _name);						\
-	trace_##_name(__VA_ARGS__);					\
-} while (0)
-
-#define bch2_fs_init_fault(name)					\
-	dynamic_fault("bcachefs:bch_fs_init:" name)
-#define bch2_meta_read_fault(name)					\
-	 dynamic_fault("bcachefs:meta:read:" name)
-#define bch2_meta_write_fault(name)					\
-	 dynamic_fault("bcachefs:meta:write:" name)
-
-#ifdef __KERNEL__
-#define BCACHEFS_LOG_PREFIX
-#endif
-
-#ifdef BCACHEFS_LOG_PREFIX
-
-#define bch2_log_msg(_c, fmt)			"bcachefs (%s): " fmt, ((_c)->name)
-#define bch2_fmt_dev(_ca, fmt)			"bcachefs (%s): " fmt "\n", ((_ca)->name)
-#define bch2_fmt_dev_offset(_ca, _offset, fmt)	"bcachefs (%s sector %llu): " fmt "\n", ((_ca)->name), (_offset)
-#define bch2_fmt_inum(_c, _inum, fmt)		"bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum)
-#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt)			\
-	 "bcachefs (%s inum %llu offset %llu): " fmt "\n", ((_c)->name), (_inum), (_offset)
-
-#else
-
-#define bch2_log_msg(_c, fmt)			fmt
-#define bch2_fmt_dev(_ca, fmt)			"%s: " fmt "\n", ((_ca)->name)
-#define bch2_fmt_dev_offset(_ca, _offset, fmt)	"%s sector %llu: " fmt "\n", ((_ca)->name), (_offset)
-#define bch2_fmt_inum(_c, _inum, fmt)		"inum %llu: " fmt "\n", (_inum)
-#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt)				\
-	 "inum %llu offset %llu: " fmt "\n", (_inum), (_offset)
-
-#endif
-
-#define bch2_fmt(_c, fmt)		bch2_log_msg(_c, fmt "\n")
-
-__printf(2, 3)
-void bch2_print_opts(struct bch_opts *, const char *, ...);
-
-__printf(2, 3)
-void __bch2_print(struct bch_fs *c, const char *fmt, ...);
-
-#define maybe_dev_to_fs(_c)	_Generic((_c),				\
-	struct bch_dev *:	((struct bch_dev *) (_c))->fs,		\
-	struct bch_fs *:	(_c))
-
-#define bch2_print(_c, ...) __bch2_print(maybe_dev_to_fs(_c), __VA_ARGS__)
-
-#define bch2_print_ratelimited(_c, ...)					\
-do {									\
-	static DEFINE_RATELIMIT_STATE(_rs,				\
-				      DEFAULT_RATELIMIT_INTERVAL,	\
-				      DEFAULT_RATELIMIT_BURST);		\
-									\
-	if (__ratelimit(&_rs))						\
-		bch2_print(_c, __VA_ARGS__);				\
-} while (0)
-
-#define bch_info(c, fmt, ...) \
-	bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_notice(c, fmt, ...) \
-	bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_warn(c, fmt, ...) \
-	bch2_print(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_warn_ratelimited(c, fmt, ...) \
-	bch2_print_ratelimited(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
-
-#define bch_err(c, fmt, ...) \
-	bch2_print(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_err_dev(ca, fmt, ...) /* KERN_ERR message prefixed with ca->name; ca is resolved to its fs by bch2_print() */ \
-	bch2_print(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
-#define bch_err_dev_offset(ca, _offset, fmt, ...) /* as bch_err_dev(), also printing the sector _offset */ \
-	bch2_print(ca, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
-#define bch_err_inum(c, _inum, fmt, ...) \
-	bch2_print(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
-#define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \
-	bch2_print(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
-
-#define bch_err_ratelimited(c, fmt, ...) \
-	bch2_print_ratelimited(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_err_dev_ratelimited(ca, fmt, ...) \
-	bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
-#define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \
-	bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
-#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \
-	bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
-#define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \
-	bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
-
-static inline bool should_print_err(int err)
-{
-	return err && !bch2_err_matches(err, BCH_ERR_transaction_restart);
-}
-
-#define bch_err_fn(_c, _ret)						\
-do {									\
-	if (should_print_err(_ret))					\
-		bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\
-} while (0)
-
-#define bch_err_fn_ratelimited(_c, _ret)				\
-do {									\
-	if (should_print_err(_ret))					\
-		bch_err_ratelimited(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\
-} while (0)
-
-#define bch_err_msg(_c, _ret, _msg, ...)				\
-do {									\
-	if (should_print_err(_ret))					\
-		bch_err(_c, "%s(): error " _msg " %s", __func__,	\
-			##__VA_ARGS__, bch2_err_str(_ret));		\
-} while (0)
-
-#define bch_verbose(c, fmt, ...)					\
-do {									\
-	if ((c)->opts.verbose)						\
-		bch_info(c, fmt, ##__VA_ARGS__);			\
-} while (0)
-
-#define pr_verbose_init(opts, fmt, ...)					\
-do {									\
-	if (opt_get(opts, verbose))					\
-		pr_info(fmt, ##__VA_ARGS__);				\
-} while (0)
-
-/* Parameters that are useful for debugging, but should always be compiled in: */
-#define BCH_DEBUG_PARAMS_ALWAYS()					\
-	BCH_DEBUG_PARAM(key_merging_disabled,				\
-		"Disables merging of extents")				\
-	BCH_DEBUG_PARAM(btree_node_merging_disabled,			\
-		"Disables merging of btree nodes")			\
-	BCH_DEBUG_PARAM(btree_gc_always_rewrite,			\
-		"Causes mark and sweep to compact and rewrite every "	\
-		"btree node it traverses")				\
-	BCH_DEBUG_PARAM(btree_gc_rewrite_disabled,			\
-		"Disables rewriting of btree nodes during mark and sweep")\
-	BCH_DEBUG_PARAM(btree_shrinker_disabled,			\
-		"Disables the shrinker callback for the btree node cache")\
-	BCH_DEBUG_PARAM(verify_btree_ondisk,				\
-		"Reread btree nodes at various points to verify the "	\
-		"mergesort in the read path against modifications "	\
-		"done in memory")					\
-	BCH_DEBUG_PARAM(verify_all_btree_replicas,			\
-		"When reading btree nodes, read all replicas and "	\
-		"compare them")						\
-	BCH_DEBUG_PARAM(backpointers_no_use_write_buffer,		\
-		"Don't use the write buffer for backpointers, enabling "\
-		"extra runtime checks")
-
-/* Parameters that should only be compiled in debug mode: */
-#define BCH_DEBUG_PARAMS_DEBUG()					\
-	BCH_DEBUG_PARAM(expensive_debug_checks,				\
-		"Enables various runtime debugging checks that "	\
-		"significantly affect performance")			\
-	BCH_DEBUG_PARAM(debug_check_iterators,				\
-		"Enables extra verification for btree iterators")	\
-	BCH_DEBUG_PARAM(debug_check_btree_accounting,			\
-		"Verify btree accounting for keys within a node")	\
-	BCH_DEBUG_PARAM(journal_seq_verify,				\
-		"Store the journal sequence number in the version "	\
-		"number of every btree key, and verify that btree "	\
-		"update ordering is preserved during recovery")		\
-	BCH_DEBUG_PARAM(inject_invalid_keys,				\
-		"Store the journal sequence number in the version "	\
-		"number of every btree key, and verify that btree "	\
-		"update ordering is preserved during recovery")		\
-	BCH_DEBUG_PARAM(test_alloc_startup,				\
-		"Force allocator startup to use the slowpath where it "	\
-		"can't find enough free buckets without invalidating "	\
-		"cached data")						\
-	BCH_DEBUG_PARAM(force_reconstruct_read,				\
-		"Force reads to use the reconstruct path, when reading "\
-		"from erasure coded extents")				\
-	BCH_DEBUG_PARAM(test_restart_gc,				\
-		"Test restarting mark and sweep gc when bucket gens change")
-
-#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL()
-#else
-#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
-#endif
-
-#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
-BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
-#ifndef CONFIG_BCACHEFS_DEBUG
-#define BCH_DEBUG_PARAM(name, description) static const __maybe_unused bool bch2_##name;
-BCH_DEBUG_PARAMS_DEBUG()
-#undef BCH_DEBUG_PARAM
-#endif
-
-#define BCH_TIME_STATS()			\
-	x(btree_node_mem_alloc)			\
-	x(btree_node_split)			\
-	x(btree_node_compact)			\
-	x(btree_node_merge)			\
-	x(btree_node_sort)			\
-	x(btree_node_read)			\
-	x(btree_node_read_done)			\
-	x(btree_interior_update_foreground)	\
-	x(btree_interior_update_total)		\
-	x(btree_gc)				\
-	x(data_write)				\
-	x(data_read)				\
-	x(data_promote)				\
-	x(journal_flush_write)			\
-	x(journal_noflush_write)		\
-	x(journal_flush_seq)			\
-	x(blocked_journal_low_on_space)		\
-	x(blocked_journal_low_on_pin)		\
-	x(blocked_journal_max_in_flight)	\
-	x(blocked_allocate)			\
-	x(blocked_allocate_open_bucket)		\
-	x(blocked_write_buffer_full)		\
-	x(nocow_lock_contended)
-
-enum bch_time_stats {
-#define x(name) BCH_TIME_##name,
-	BCH_TIME_STATS()
-#undef x
-	BCH_TIME_STAT_NR
-};
-
-#include "alloc_types.h"
-#include "btree_types.h"
-#include "btree_node_scan_types.h"
-#include "btree_write_buffer_types.h"
-#include "buckets_types.h"
-#include "buckets_waiting_for_journal_types.h"
-#include "clock_types.h"
-#include "disk_groups_types.h"
-#include "ec_types.h"
-#include "journal_types.h"
-#include "keylist_types.h"
-#include "quota_types.h"
-#include "rebalance_types.h"
-#include "replicas_types.h"
-#include "sb-members_types.h"
-#include "subvolume_types.h"
-#include "super_types.h"
-#include "thread_with_file_types.h"
-
-/* Number of nodes btree coalesce will try to coalesce at once */
-#define GC_MERGE_NODES		4U
-
-/* Maximum number of nodes we might need to allocate atomically: */
-#define BTREE_RESERVE_MAX	(BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1))
-
-/* Size of the freelist we allocate btree nodes from: */
-#define BTREE_NODE_RESERVE	(BTREE_RESERVE_MAX * 4)
-
-#define BTREE_NODE_OPEN_BUCKET_RESERVE	(BTREE_RESERVE_MAX * BCH_REPLICAS_MAX)
-
-struct btree;
-
-enum gc_phase {
-	GC_PHASE_NOT_RUNNING,
-	GC_PHASE_START,
-	GC_PHASE_SB,
-
-	GC_PHASE_BTREE_stripes,
-	GC_PHASE_BTREE_extents,
-	GC_PHASE_BTREE_inodes,
-	GC_PHASE_BTREE_dirents,
-	GC_PHASE_BTREE_xattrs,
-	GC_PHASE_BTREE_alloc,
-	GC_PHASE_BTREE_quotas,
-	GC_PHASE_BTREE_reflink,
-	GC_PHASE_BTREE_subvolumes,
-	GC_PHASE_BTREE_snapshots,
-	GC_PHASE_BTREE_lru,
-	GC_PHASE_BTREE_freespace,
-	GC_PHASE_BTREE_need_discard,
-	GC_PHASE_BTREE_backpointers,
-	GC_PHASE_BTREE_bucket_gens,
-	GC_PHASE_BTREE_snapshot_trees,
-	GC_PHASE_BTREE_deleted_inodes,
-	GC_PHASE_BTREE_logged_ops,
-	GC_PHASE_BTREE_rebalance_work,
-	GC_PHASE_BTREE_subvolume_children,
-
-	GC_PHASE_PENDING_DELETE,
-};
-
-struct gc_pos {
-	enum gc_phase		phase;
-	u16			level;
-	struct bpos		pos;
-};
-
-struct reflink_gc {
-	u64		offset;
-	u32		size;
-	u32		refcount;
-};
-
-typedef GENRADIX(struct reflink_gc) reflink_gc_table;
-
-struct io_count {
-	u64			sectors[2][BCH_DATA_NR];
-};
-
-struct bch_dev {
-	struct kobject		kobj;
-#ifdef CONFIG_BCACHEFS_DEBUG
-	atomic_long_t		ref;
-	bool			dying;
-	unsigned long		last_put;
-#else
-	struct percpu_ref	ref;
-#endif
-	struct completion	ref_completion;
-	struct percpu_ref	io_ref;
-	struct completion	io_ref_completion;
-
-	struct bch_fs		*fs;
-
-	u8			dev_idx;
-	/*
-	 * Cached version of this device's member info from superblock
-	 * Committed by bch2_write_super() -> bch_fs_mi_update()
-	 */
-	struct bch_member_cpu	mi;
-	atomic64_t		errors[BCH_MEMBER_ERROR_NR];
-
-	__uuid_t		uuid;
-	char			name[BDEVNAME_SIZE];
-
-	struct bch_sb_handle	disk_sb;
-	struct bch_sb		*sb_read_scratch;
-	int			sb_write_error;
-	dev_t			dev;
-	atomic_t		flush_seq;
-
-	struct bch_devs_mask	self;
-
-	/*
-	 * Buckets:
-	 * Per-bucket arrays are protected by c->mark_lock, bucket_lock and
-	 * gc_lock, for device resize - holding any is sufficient for access:
-	 * Or rcu_read_lock(), but only for dev_ptr_stale():
-	 */
-	struct bucket_array __rcu *buckets_gc;
-	struct bucket_gens __rcu *bucket_gens;
-	u8			*oldest_gen;
-	unsigned long		*buckets_nouse;
-	struct rw_semaphore	bucket_lock;
-
-	struct bch_dev_usage		*usage_base;
-	struct bch_dev_usage __percpu	*usage[JOURNAL_BUF_NR];
-	struct bch_dev_usage __percpu	*usage_gc;
-
-	/* Allocator: */
-	u64			new_fs_bucket_idx;
-	u64			alloc_cursor[3];
-
-	unsigned		nr_open_buckets;
-	unsigned		nr_btree_reserve;
-
-	size_t			inc_gen_needs_gc;
-	size_t			inc_gen_really_needs_gc;
-	size_t			buckets_waiting_on_journal;
-
-	atomic64_t		rebalance_work;
-
-	struct journal_device	journal;
-	u64			prev_journal_sector;
-
-	struct work_struct	io_error_work;
-
-	/* The rest of this all shows up in sysfs */
-	atomic64_t		cur_latency[2];
-	struct bch2_time_stats_quantiles io_latency[2];
-
-#define CONGESTED_MAX		1024
-	atomic_t		congested;
-	u64			congested_last;
-
-	struct io_count __percpu *io_done;
-};
-
-/*
- * initial_gc_unfixed
- * error
- * topology error
- */
-
-#define BCH_FS_FLAGS()			\
-	x(new_fs)			\
-	x(started)			\
-	x(may_go_rw)			\
-	x(rw)				\
-	x(was_rw)			\
-	x(stopping)			\
-	x(emergency_ro)			\
-	x(going_ro)			\
-	x(write_disable_complete)	\
-	x(clean_shutdown)		\
-	x(fsck_running)			\
-	x(initial_gc_unfixed)		\
-	x(need_delete_dead_snapshots)	\
-	x(error)			\
-	x(topology_error)		\
-	x(errors_fixed)			\
-	x(errors_not_fixed)		\
-	x(no_invalid_checks)
-
-enum bch_fs_flags {
-#define x(n)		BCH_FS_##n,
-	BCH_FS_FLAGS()
-#undef x
-};
-
-struct btree_debug {
-	unsigned		id;
-};
-
-#define BCH_TRANSACTIONS_NR 128
-
-struct btree_transaction_stats {
-	struct bch2_time_stats	duration;
-	struct bch2_time_stats	lock_hold_times;
-	struct mutex		lock;
-	unsigned		nr_max_paths;
-	unsigned		journal_entries_size;
-	unsigned		max_mem;
-	char			*max_paths_text;
-};
-
-struct bch_fs_pcpu {
-	u64			sectors_available;
-};
-
-struct journal_seq_blacklist_table {
-	size_t			nr;
-	struct journal_seq_blacklist_table_entry {
-		u64		start;
-		u64		end;
-		bool		dirty;
-	}			entries[];
-};
-
-struct journal_keys {
-	/* must match layout in darray_types.h */
-	size_t			nr, size;
-	struct journal_key {
-		u64		journal_seq;
-		u32		journal_offset;
-		enum btree_id	btree_id:8;
-		unsigned	level:8;
-		bool		allocated;
-		bool		overwritten;
-		struct bkey_i	*k;
-	}			*data;
-	/*
-	 * Gap buffer: instead of all the empty space in the array being at the
-	 * end of the buffer - from @nr to @size - the empty space is at @gap.
-	 * This means that sequential insertions are O(n) instead of O(n^2).
-	 */
-	size_t			gap;
-	atomic_t		ref;
-	bool			initial_ref_held;
-};
-
-struct btree_trans_buf {
-	struct btree_trans	*trans;
-};
-
-#define REPLICAS_DELTA_LIST_MAX	(1U << 16)
-
-#define BCACHEFS_ROOT_SUBVOL_INUM					\
-	((subvol_inum) { BCACHEFS_ROOT_SUBVOL,	BCACHEFS_ROOT_INO })
-
-#define BCH_WRITE_REFS()						\
-	x(trans)							\
-	x(write)							\
-	x(promote)							\
-	x(node_rewrite)							\
-	x(stripe_create)						\
-	x(stripe_delete)						\
-	x(reflink)							\
-	x(fallocate)							\
-	x(fsync)							\
-	x(dio_write)							\
-	x(discard)							\
-	x(discard_fast)							\
-	x(invalidate)							\
-	x(delete_dead_snapshots)					\
-	x(gc_gens)							\
-	x(snapshot_delete_pagecache)					\
-	x(sysfs)							\
-	x(btree_write_buffer)
-
-enum bch_write_ref {
-#define x(n) BCH_WRITE_REF_##n,
-	BCH_WRITE_REFS()
-#undef x
-	BCH_WRITE_REF_NR,
-};
-
-struct bch_fs {
-	struct closure		cl;
-
-	struct list_head	list;
-	struct kobject		kobj;
-	struct kobject		counters_kobj;
-	struct kobject		internal;
-	struct kobject		opts_dir;
-	struct kobject		time_stats;
-	unsigned long		flags;
-
-	int			minor;
-	struct device		*chardev;
-	struct super_block	*vfs_sb;
-	dev_t			dev;
-	char			name[40];
-	struct stdio_redirect	*stdio;
-	struct task_struct	*stdio_filter;
-
-	/* ro/rw, add/remove/resize devices: */
-	struct rw_semaphore	state_lock;
-
-	/* Counts outstanding writes, for clean transition to read-only */
-#ifdef BCH_WRITE_REF_DEBUG
-	atomic_long_t		writes[BCH_WRITE_REF_NR];
-#else
-	struct percpu_ref	writes;
-#endif
-	/*
-	 * Analogous to c->writes, for asynchronous ops that don't necessarily
-	 * need fs to be read-write
-	 */
-	refcount_t		ro_ref;
-	wait_queue_head_t	ro_ref_wait;
-
-	struct work_struct	read_only_work;
-
-	struct bch_dev __rcu	*devs[BCH_SB_MEMBERS_MAX];
-
-	struct bch_replicas_cpu replicas;
-	struct bch_replicas_cpu replicas_gc;
-	struct mutex		replicas_gc_lock;
-	mempool_t		replicas_delta_pool;
-
-	struct journal_entry_res btree_root_journal_res;
-	struct journal_entry_res replicas_journal_res;
-	struct journal_entry_res clock_journal_res;
-	struct journal_entry_res dev_usage_journal_res;
-
-	struct bch_disk_groups_cpu __rcu *disk_groups;
-
-	struct bch_opts		opts;
-
-	/* Updated by bch2_sb_update():*/
-	struct {
-		__uuid_t	uuid;
-		__uuid_t	user_uuid;
-
-		u16		version;
-		u16		version_min;
-		u16		version_upgrade_complete;
-
-		u8		nr_devices;
-		u8		clean;
-
-		u8		encryption_type;
-
-		u64		time_base_lo;
-		u32		time_base_hi;
-		unsigned	time_units_per_sec;
-		unsigned	nsec_per_time_unit;
-		u64		features;
-		u64		compat;
-		unsigned long	errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)];
-		u64		btrees_lost_data;
-	}			sb;
-
-
-	struct bch_sb_handle	disk_sb;
-
-	unsigned short		block_bits;	/* ilog2(block_size) */
-
-	u16			btree_foreground_merge_threshold;
-
-	struct closure		sb_write;
-	struct mutex		sb_lock;
-
-	/* snapshot.c: */
-	struct snapshot_table __rcu *snapshots;
-	struct mutex		snapshot_table_lock;
-	struct rw_semaphore	snapshot_create_lock;
-
-	struct work_struct	snapshot_delete_work;
-	struct work_struct	snapshot_wait_for_pagecache_and_delete_work;
-	snapshot_id_list	snapshots_unlinked;
-	struct mutex		snapshots_unlinked_lock;
-
-	/* BTREE CACHE */
-	struct bio_set		btree_bio;
-	struct workqueue_struct	*io_complete_wq;
-
-	struct btree_root	btree_roots_known[BTREE_ID_NR];
-	DARRAY(struct btree_root) btree_roots_extra;
-	struct mutex		btree_root_lock;
-
-	struct btree_cache	btree_cache;
-
-	/*
-	 * Cache of allocated btree nodes - if we allocate a btree node and
-	 * don't use it, if we free it that space can't be reused until going
-	 * _all_ the way through the allocator (which exposes us to a livelock
-	 * when allocating btree reserves fail halfway through) - instead, we
-	 * can stick them here:
-	 */
-	struct btree_alloc	btree_reserve_cache[BTREE_NODE_RESERVE * 2];
-	unsigned		btree_reserve_cache_nr;
-	struct mutex		btree_reserve_cache_lock;
-
-	mempool_t		btree_interior_update_pool;
-	struct list_head	btree_interior_update_list;
-	struct list_head	btree_interior_updates_unwritten;
-	struct mutex		btree_interior_update_lock;
-	struct closure_waitlist	btree_interior_update_wait;
-
-	struct workqueue_struct	*btree_interior_update_worker;
-	struct work_struct	btree_interior_update_work;
-
-	struct workqueue_struct	*btree_node_rewrite_worker;
-
-	struct list_head	pending_node_rewrites;
-	struct mutex		pending_node_rewrites_lock;
-
-	/* btree_io.c: */
-	spinlock_t		btree_write_error_lock;
-	struct btree_write_stats {
-		atomic64_t	nr;
-		atomic64_t	bytes;
-	}			btree_write_stats[BTREE_WRITE_TYPE_NR];
-
-	/* btree_iter.c: */
-	struct seqmutex		btree_trans_lock;
-	struct list_head	btree_trans_list;
-	mempool_t		btree_trans_pool;
-	mempool_t		btree_trans_mem_pool;
-	struct btree_trans_buf  __percpu	*btree_trans_bufs;
-
-	struct srcu_struct	btree_trans_barrier;
-	bool			btree_trans_barrier_initialized;
-
-	struct btree_key_cache	btree_key_cache;
-	unsigned		btree_key_cache_btrees;
-
-	struct btree_write_buffer btree_write_buffer;
-
-	struct workqueue_struct	*btree_update_wq;
-	struct workqueue_struct	*btree_io_complete_wq;
-	/* copygc needs its own workqueue for index updates.. */
-	struct workqueue_struct	*copygc_wq;
-	/*
-	 * Use a dedicated wq for write ref holder tasks. Required to avoid
-	 * dependency problems with other wq tasks that can block on ref
-	 * draining, such as read-only transition.
-	 */
-	struct workqueue_struct *write_ref_wq;
-
-	/* ALLOCATION */
-	struct bch_devs_mask	rw_devs[BCH_DATA_NR];
-
-	u64			capacity; /* sectors */
-
-	/*
-	 * When capacity _decreases_ (due to a disk being removed), we
-	 * increment capacity_gen - this invalidates outstanding reservations
-	 * and forces them to be revalidated
-	 */
-	u32			capacity_gen;
-	unsigned		bucket_size_max;
-
-	atomic64_t		sectors_available;
-	struct mutex		sectors_available_lock;
-
-	struct bch_fs_pcpu __percpu	*pcpu;
-
-	struct percpu_rw_semaphore	mark_lock;
-
-	seqcount_t			usage_lock;
-	struct bch_fs_usage		*usage_base;
-	struct bch_fs_usage __percpu	*usage[JOURNAL_BUF_NR];
-	struct bch_fs_usage __percpu	*usage_gc;
-	u64 __percpu		*online_reserved;
-
-	/* single element mempool: */
-	struct mutex		usage_scratch_lock;
-	struct bch_fs_usage_online *usage_scratch;
-
-	struct io_clock		io_clock[2];
-
-	/* JOURNAL SEQ BLACKLIST */
-	struct journal_seq_blacklist_table *
-				journal_seq_blacklist_table;
-
-	/* ALLOCATOR */
-	spinlock_t		freelist_lock;
-	struct closure_waitlist	freelist_wait;
-
-	open_bucket_idx_t	open_buckets_freelist;
-	open_bucket_idx_t	open_buckets_nr_free;
-	struct closure_waitlist	open_buckets_wait;
-	struct open_bucket	open_buckets[OPEN_BUCKETS_COUNT];
-	open_bucket_idx_t	open_buckets_hash[OPEN_BUCKETS_COUNT];
-
-	open_bucket_idx_t	open_buckets_partial[OPEN_BUCKETS_COUNT];
-	open_bucket_idx_t	open_buckets_partial_nr;
-
-	struct write_point	btree_write_point;
-	struct write_point	rebalance_write_point;
-
-	struct write_point	write_points[WRITE_POINT_MAX];
-	struct hlist_head	write_points_hash[WRITE_POINT_HASH_NR];
-	struct mutex		write_points_hash_lock;
-	unsigned		write_points_nr;
-
-	struct buckets_waiting_for_journal buckets_waiting_for_journal;
-	struct work_struct	invalidate_work;
-	struct work_struct	discard_work;
-	struct mutex		discard_buckets_in_flight_lock;
-	DARRAY(struct bpos)	discard_buckets_in_flight;
-	struct work_struct	discard_fast_work;
-
-	/* GARBAGE COLLECTION */
-	struct work_struct	gc_gens_work;
-	unsigned long		gc_count;
-
-	enum btree_id		gc_gens_btree;
-	struct bpos		gc_gens_pos;
-
-	/*
-	 * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos]
-	 * has been marked by GC.
-	 *
-	 * gc_cur_phase is a superset of btree_ids (BTREE_ID_extents etc.)
-	 *
-	 * Protected by gc_pos_lock. Only written to by GC thread, so GC thread
-	 * can read without a lock.
-	 */
-	seqcount_t		gc_pos_lock;
-	struct gc_pos		gc_pos;
-
-	/*
-	 * The allocation code needs gc_mark in struct bucket to be correct, but
-	 * it's not while a gc is in progress.
-	 */
-	struct rw_semaphore	gc_lock;
-	struct mutex		gc_gens_lock;
-
-	/* IO PATH */
-	struct semaphore	io_in_flight;
-	struct bio_set		bio_read;
-	struct bio_set		bio_read_split;
-	struct bio_set		bio_write;
-	struct bio_set		replica_set;
-	struct mutex		bio_bounce_pages_lock;
-	mempool_t		bio_bounce_pages;
-	struct bucket_nocow_lock_table
-				nocow_locks;
-	struct rhashtable	promote_table;
-
-	mempool_t		compression_bounce[2];
-	mempool_t		compress_workspace[BCH_COMPRESSION_TYPE_NR];
-	mempool_t		decompress_workspace;
-	size_t			zstd_workspace_size;
-
-	struct crypto_shash	*sha256;
-	struct crypto_sync_skcipher *chacha20;
-	struct crypto_shash	*poly1305;
-
-	atomic64_t		key_version;
-
-	mempool_t		large_bkey_pool;
-
-	/* MOVE.C */
-	struct list_head	moving_context_list;
-	struct mutex		moving_context_lock;
-
-	/* REBALANCE */
-	struct bch_fs_rebalance	rebalance;
-
-	/* COPYGC */
-	struct task_struct	*copygc_thread;
-	struct write_point	copygc_write_point;
-	s64			copygc_wait_at;
-	s64			copygc_wait;
-	bool			copygc_running;
-	wait_queue_head_t	copygc_running_wq;
-
-	/* STRIPES: */
-	GENRADIX(struct stripe) stripes;
-	GENRADIX(struct gc_stripe) gc_stripes;
-
-	struct hlist_head	ec_stripes_new[32];
-	spinlock_t		ec_stripes_new_lock;
-
-	ec_stripes_heap		ec_stripes_heap;
-	struct mutex		ec_stripes_heap_lock;
-
-	/* ERASURE CODING */
-	struct list_head	ec_stripe_head_list;
-	struct mutex		ec_stripe_head_lock;
-
-	struct list_head	ec_stripe_new_list;
-	struct mutex		ec_stripe_new_lock;
-	wait_queue_head_t	ec_stripe_new_wait;
-
-	struct work_struct	ec_stripe_create_work;
-	u64			ec_stripe_hint;
-
-	struct work_struct	ec_stripe_delete_work;
-
-	struct bio_set		ec_bioset;
-
-	/* REFLINK */
-	reflink_gc_table	reflink_gc_table;
-	size_t			reflink_gc_nr;
-
-	/* fs.c */
-	struct list_head	vfs_inodes_list;
-	struct mutex		vfs_inodes_lock;
-
-	/* VFS IO PATH - fs-io.c */
-	struct bio_set		writepage_bioset;
-	struct bio_set		dio_write_bioset;
-	struct bio_set		dio_read_bioset;
-	struct bio_set		nocow_flush_bioset;
-
-	/* QUOTAS */
-	struct bch_memquota_type quotas[QTYP_NR];
-
-	/* RECOVERY */
-	u64			journal_replay_seq_start;
-	u64			journal_replay_seq_end;
-	/*
-	 * Two different uses:
-	 * "Has this fsck pass?" - i.e. should this type of error be an
-	 * emergency read-only
-	 * And, in certain situations fsck will rewind to an earlier pass: used
-	 * for signaling to the toplevel code which pass we want to run now.
-	 */
-	enum bch_recovery_pass	curr_recovery_pass;
-	/* bitmap of explicitly enabled recovery passes: */
-	u64			recovery_passes_explicit;
-	/* bitmask of recovery passes that we actually ran */
-	u64			recovery_passes_complete;
-	/* never rewinds version of curr_recovery_pass */
-	enum bch_recovery_pass	recovery_pass_done;
-	struct semaphore	online_fsck_mutex;
-
-	/* DEBUG JUNK */
-	struct dentry		*fs_debug_dir;
-	struct dentry		*btree_debug_dir;
-	struct btree_debug	btree_debug[BTREE_ID_NR];
-	struct btree		*verify_data;
-	struct btree_node	*verify_ondisk;
-	struct mutex		verify_lock;
-
-	u64			*unused_inode_hints;
-	unsigned		inode_shard_bits;
-
-	/*
-	 * A btree node on disk could have too many bsets for an iterator to fit
-	 * on the stack - have to dynamically allocate them
-	 */
-	mempool_t		fill_iter;
-
-	mempool_t		btree_bounce_pool;
-
-	struct journal		journal;
-	GENRADIX(struct journal_replay *) journal_entries;
-	u64			journal_entries_base_seq;
-	struct journal_keys	journal_keys;
-	struct list_head	journal_iters;
-
-	struct find_btree_nodes	found_btree_nodes;
-
-	u64			last_bucket_seq_cleanup;
-
-	u64			counters_on_mount[BCH_COUNTER_NR];
-	u64 __percpu		*counters;
-
-	unsigned		copy_gc_enabled:1;
-	bool			promote_whole_extents;
-
-	struct bch2_time_stats	times[BCH_TIME_STAT_NR];
-
-	struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
-
-	/* ERRORS */
-	struct list_head	fsck_error_msgs;
-	struct mutex		fsck_error_msgs_lock;
-	bool			fsck_alloc_msgs_err;
-
-	bch_sb_errors_cpu	fsck_error_counts;
-	struct mutex		fsck_error_counts_lock;
-};
-
-extern struct wait_queue_head bch2_read_only_wait;
-
-static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref)
-{
-#ifdef BCH_WRITE_REF_DEBUG
-	atomic_long_inc(&c->writes[ref]);
-#else
-	percpu_ref_get(&c->writes);
-#endif
-}
-
-static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
-{
-#ifdef BCH_WRITE_REF_DEBUG
-	return !test_bit(BCH_FS_going_ro, &c->flags) &&
-		atomic_long_inc_not_zero(&c->writes[ref]);
-#else
-	return percpu_ref_tryget(&c->writes);
-#endif
-}
-
-static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
-{
-#ifdef BCH_WRITE_REF_DEBUG
-	return !test_bit(BCH_FS_going_ro, &c->flags) &&
-		atomic_long_inc_not_zero(&c->writes[ref]);
-#else
-	return percpu_ref_tryget_live(&c->writes);
-#endif
-}
-
-static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref)
-{
-#ifdef BCH_WRITE_REF_DEBUG
-	long v = atomic_long_dec_return(&c->writes[ref]);
-
-	BUG_ON(v < 0);
-	if (v)
-		return;
-	for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
-		if (atomic_long_read(&c->writes[i]))
-			return;
-
-	set_bit(BCH_FS_write_disable_complete, &c->flags);
-	wake_up(&bch2_read_only_wait);
-#else
-	percpu_ref_put(&c->writes);
-#endif
-}
-
-static inline bool bch2_ro_ref_tryget(struct bch_fs *c)
-{
-	if (test_bit(BCH_FS_stopping, &c->flags))
-		return false;
-
-	return refcount_inc_not_zero(&c->ro_ref);
-}
-
-static inline void bch2_ro_ref_put(struct bch_fs *c)
-{
-	if (refcount_dec_and_test(&c->ro_ref))
-		wake_up(&c->ro_ref_wait);
-}
-
-static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
-{
-#ifndef NO_BCACHEFS_FS
-	if (c->vfs_sb)
-		c->vfs_sb->s_bdi->ra_pages = ra_pages;
-#endif
-}
-
-static inline unsigned bucket_bytes(const struct bch_dev *ca)
-{
-	return ca->mi.bucket_size << 9;
-}
-
-static inline unsigned block_bytes(const struct bch_fs *c)
-{
-	return c->opts.block_size;
-}
-
-static inline unsigned block_sectors(const struct bch_fs *c)
-{
-	return c->opts.block_size >> 9;
-}
-
-static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
-{
-	return c->btree_key_cache_btrees & (1U << btree);
-}
-
-static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time)
-{
-	struct timespec64 t;
-	s32 rem;
-
-	time += c->sb.time_base_lo;
-
-	t.tv_sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
-	t.tv_nsec = rem * c->sb.nsec_per_time_unit;
-	return t;
-}
-
-static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts)
-{
-	return (ts.tv_sec * c->sb.time_units_per_sec +
-		(int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo;
-}
-
-static inline s64 bch2_current_time(const struct bch_fs *c)
-{
-	struct timespec64 now;
-
-	ktime_get_coarse_real_ts64(&now);
-	return timespec_to_bch2_time(c, now);
-}
-
-static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c)
-{
-	struct stdio_redirect *stdio = c->stdio;
-
-	if (c->stdio_filter && c->stdio_filter != current)
-		stdio = NULL;
-	return stdio;
-}
-
-static inline unsigned metadata_replicas_required(struct bch_fs *c)
-{
-	return min(c->opts.metadata_replicas,
-		   c->opts.metadata_replicas_required);
-}
-
-static inline unsigned data_replicas_required(struct bch_fs *c)
-{
-	return min(c->opts.data_replicas,
-		   c->opts.data_replicas_required);
-}
-
-#define BKEY_PADDED_ONSTACK(key, pad)				\
-	struct { struct bkey_i key; __u64 key ## _pad[pad]; }
-
-#endif /* _BCACHEFS_H */
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
deleted file mode 100644
index 1bebba881d89..000000000000
--- a/fs/bcachefs/bcachefs_format.h
+++ /dev/null
@@ -1,1667 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FORMAT_H
-#define _BCACHEFS_FORMAT_H
-
-/*
- * bcachefs on disk data structures
- *
- * OVERVIEW:
- *
- * There are three main types of on disk data structures in bcachefs (this is
- * reduced from 5 in bcache)
- *
- *  - superblock
- *  - journal
- *  - btree
- *
- * The btree is the primary structure; most metadata exists as keys in the
- * various btrees. There are only a small number of btrees, they're not
- * sharded - we have one btree for extents, another for inodes, et cetera.
- *
- * SUPERBLOCK:
- *
- * The superblock contains the location of the journal, the list of devices in
- * the filesystem, and in general any metadata we need in order to decide
- * whether we can start a filesystem or prior to reading the journal/btree
- * roots.
- *
- * The superblock is extensible, and most of the contents of the superblock are
- * in variable length, type tagged fields; see struct bch_sb_field.
- *
- * Backup superblocks do not reside in a fixed location; also, superblocks do
- * not have a fixed size. To locate backup superblocks we have struct
- * bch_sb_layout; we store a copy of this inside every superblock, and also
- * before the first superblock.
- *
- * JOURNAL:
- *
- * The journal primarily records btree updates in the order they occurred;
- * journal replay consists of just iterating over all the keys in the open
- * journal entries and re-inserting them into the btrees.
- *
- * The journal also contains entry types for the btree roots, and blacklisted
- * journal sequence numbers (see journal_seq_blacklist.c).
- *
- * BTREE:
- *
- * bcachefs btrees are copy on write b+ trees, where nodes are big (typically
- * 128k-256k) and log structured. We use struct btree_node for writing the first
- * entry in a given node (offset 0), and struct btree_node_entry for all
- * subsequent writes.
- *
- * After the header, btree node entries contain a list of keys in sorted order.
- * Values are stored inline with the keys; since values are variable length (and
- * keys effectively are variable length too, due to packing) we can't do random
- * access without building up additional in memory tables in the btree node read
- * path.
- *
- * BTREE KEYS (struct bkey):
- *
- * The various btrees share a common format for the key - so as to avoid
- * switching in fastpath lookup/comparison code - but define their own
- * structures for the key values.
- *
- * The size of a key/value pair is stored as a u8 in units of u64s, so the max
- * size is just under 2k. The common part also contains a type tag for the
- * value, and a format field indicating whether the key is packed or not (and
- * also meant to allow adding new key fields in the future, if desired).
- *
- * bkeys, when stored within a btree node, may also be packed. In that case, the
- * bkey_format in that node is used to unpack it. Packed bkeys mean that we can
- * be generous with field sizes in the common part of the key format (64 bit
- * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
- */
-
-#include <asm/types.h>
-#include <asm/byteorder.h>
-#include <linux/kernel.h>
-#include <linux/uuid.h>
-#include <uapi/linux/magic.h>
-#include "vstructs.h"
-
-#ifdef __KERNEL__
-typedef uuid_t __uuid_t;
-#endif
-
-#define BITMASK(name, type, field, offset, end)				\
-static const __maybe_unused unsigned	name##_OFFSET = offset;		\
-static const __maybe_unused unsigned	name##_BITS = (end - offset);	\
-									\
-static inline __u64 name(const type *k)					\
-{									\
-	return (k->field >> offset) & ~(~0ULL << (end - offset));	\
-}									\
-									\
-static inline void SET_##name(type *k, __u64 v)				\
-{									\
-	k->field &= ~(~(~0ULL << (end - offset)) << offset);		\
-	k->field |= (v & ~(~0ULL << (end - offset))) << offset;		\
-}
-
-#define LE_BITMASK(_bits, name, type, field, offset, end)		\
-static const __maybe_unused unsigned	name##_OFFSET = offset;		\
-static const __maybe_unused unsigned	name##_BITS = (end - offset);	\
-static const __maybe_unused __u##_bits	name##_MAX = (1ULL << (end - offset)) - 1;\
-									\
-static inline __u64 name(const type *k)					\
-{									\
-	return (__le##_bits##_to_cpu(k->field) >> offset) &		\
-		~(~0ULL << (end - offset));				\
-}									\
-									\
-static inline void SET_##name(type *k, __u64 v)				\
-{									\
-	__u##_bits new = __le##_bits##_to_cpu(k->field);		\
-									\
-	new &= ~(~(~0ULL << (end - offset)) << offset);			\
-	new |= (v & ~(~0ULL << (end - offset))) << offset;		\
-	k->field = __cpu_to_le##_bits(new);				\
-}
-
-#define LE16_BITMASK(n, t, f, o, e)	LE_BITMASK(16, n, t, f, o, e)
-#define LE32_BITMASK(n, t, f, o, e)	LE_BITMASK(32, n, t, f, o, e)
-#define LE64_BITMASK(n, t, f, o, e)	LE_BITMASK(64, n, t, f, o, e)
-
-struct bkey_format {
-	__u8		key_u64s;
-	__u8		nr_fields;
-	/* One unused slot for now: */
-	__u8		bits_per_field[6];
-	__le64		field_offset[6];
-};
-
-/* Btree keys - all units are in sectors */
-
-struct bpos {
-	/*
-	 * Word order matches machine byte order - btree code treats a bpos as a
-	 * single large integer, for search/comparison purposes
-	 *
-	 * Note that wherever a bpos is embedded in another on disk data
-	 * structure, it has to be byte swabbed when reading in metadata that
-	 * wasn't written in native endian order:
-	 */
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-	__u32		snapshot;
-	__u64		offset;
-	__u64		inode;
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-	__u64		inode;
-	__u64		offset;		/* Points to end of extent - sectors */
-	__u32		snapshot;
-#else
-#error edit for your odd byteorder.
-#endif
-} __packed
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-__aligned(4)
-#endif
-;
-
-#define KEY_INODE_MAX			((__u64)~0ULL)
-#define KEY_OFFSET_MAX			((__u64)~0ULL)
-#define KEY_SNAPSHOT_MAX		((__u32)~0U)
-#define KEY_SIZE_MAX			((__u32)~0U)
-
-static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot)
-{
-	return (struct bpos) {
-		.inode		= inode,
-		.offset		= offset,
-		.snapshot	= snapshot,
-	};
-}
-
-#define POS_MIN				SPOS(0, 0, 0)
-#define POS_MAX				SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, 0)
-#define SPOS_MAX			SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX)
-#define POS(_inode, _offset)		SPOS(_inode, _offset, 0)
-
-/* Empty placeholder struct, for container_of() */
-struct bch_val {
-	__u64		__nothing[0];
-};
-
-struct bversion {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-	__u64		lo;
-	__u32		hi;
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-	__u32		hi;
-	__u64		lo;
-#endif
-} __packed
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-__aligned(4)
-#endif
-;
-
-struct bkey {
-	/* Size of combined key and value, in u64s */
-	__u8		u64s;
-
-	/* Format of key (0 for format local to btree node) */
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-	__u8		format:7,
-			needs_whiteout:1;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-	__u8		needs_whiteout:1,
-			format:7;
-#else
-#error edit for your odd byteorder.
-#endif
-
-	/* Type of the value */
-	__u8		type;
-
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-	__u8		pad[1];
-
-	struct bversion	version;
-	__u32		size;		/* extent size, in sectors */
-	struct bpos	p;
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-	struct bpos	p;
-	__u32		size;		/* extent size, in sectors */
-	struct bversion	version;
-
-	__u8		pad[1];
-#endif
-} __packed
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-/*
- * The big-endian version of bkey can't be compiled by rustc with the "aligned"
- * attr since it doesn't allow types to have both "packed" and "aligned" attrs.
- * So for Rust compatibility, don't include this. It can be included in the LE
- * version because the "packed" attr is redundant in that case.
- *
- * History: (quoting Kent)
- *
- * Specifically, when i was designing bkey, I wanted the header to be no
- * bigger than necessary so that bkey_packed could use the rest. That means that
- * decently offten extent keys will fit into only 8 bytes, instead of spilling over
- * to 16.
- *
- * But packed_bkey treats the part after the header - the packed section -
- * as a single multi word, variable length integer. And bkey, the unpacked
- * version, is just a special case version of a bkey_packed; all the packed
- * bkey code will work on keys in any packed format, the in-memory
- * representation of an unpacked key also is just one type of packed key...
- *
- * So that constrains the key part of a bkig endian bkey to start right
- * after the header.
- *
- * If we ever do a bkey_v2 and need to expand the hedaer by another byte for
- * some reason - that will clean up this wart.
- */
-__aligned(8)
-#endif
-;
-
-struct bkey_packed {
-	__u64		_data[0];
-
-	/* Size of combined key and value, in u64s */
-	__u8		u64s;
-
-	/* Format of key (0 for format local to btree node) */
-
-	/*
-	 * XXX: next incompat on disk format change, switch format and
-	 * needs_whiteout - bkey_packed() will be cheaper if format is the high
-	 * bits of the bitfield
-	 */
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-	__u8		format:7,
-			needs_whiteout:1;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-	__u8		needs_whiteout:1,
-			format:7;
-#endif
-
-	/* Type of the value */
-	__u8		type;
-	__u8		key_start[0];
-
-	/*
-	 * We copy bkeys with struct assignment in various places, and while
-	 * that shouldn't be done with packed bkeys we can't disallow it in C,
-	 * and it's legal to cast a bkey to a bkey_packed  - so padding it out
-	 * to the same size as struct bkey should hopefully be safest.
-	 */
-	__u8		pad[sizeof(struct bkey) - 3];
-} __packed __aligned(8);
-
-typedef struct {
-	__le64			lo;
-	__le64			hi;
-} bch_le128;
-
-#define BKEY_U64s			(sizeof(struct bkey) / sizeof(__u64))
-#define BKEY_U64s_MAX			U8_MAX
-#define BKEY_VAL_U64s_MAX		(BKEY_U64s_MAX - BKEY_U64s)
-
-#define KEY_PACKED_BITS_START		24
-
-#define KEY_FORMAT_LOCAL_BTREE		0
-#define KEY_FORMAT_CURRENT		1
-
-enum bch_bkey_fields {
-	BKEY_FIELD_INODE,
-	BKEY_FIELD_OFFSET,
-	BKEY_FIELD_SNAPSHOT,
-	BKEY_FIELD_SIZE,
-	BKEY_FIELD_VERSION_HI,
-	BKEY_FIELD_VERSION_LO,
-	BKEY_NR_FIELDS,
-};
-
-#define bkey_format_field(name, field)					\
-	[BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8)
-
-#define BKEY_FORMAT_CURRENT						\
-((struct bkey_format) {							\
-	.key_u64s	= BKEY_U64s,					\
-	.nr_fields	= BKEY_NR_FIELDS,				\
-	.bits_per_field = {						\
-		bkey_format_field(INODE,	p.inode),		\
-		bkey_format_field(OFFSET,	p.offset),		\
-		bkey_format_field(SNAPSHOT,	p.snapshot),		\
-		bkey_format_field(SIZE,		size),			\
-		bkey_format_field(VERSION_HI,	version.hi),		\
-		bkey_format_field(VERSION_LO,	version.lo),		\
-	},								\
-})
-
-/* bkey with inline value */
-struct bkey_i {
-	__u64			_data[0];
-
-	struct bkey	k;
-	struct bch_val	v;
-};
-
-#define POS_KEY(_pos)							\
-((struct bkey) {							\
-	.u64s		= BKEY_U64s,					\
-	.format		= KEY_FORMAT_CURRENT,				\
-	.p		= _pos,						\
-})
-
-#define KEY(_inode, _offset, _size)					\
-((struct bkey) {							\
-	.u64s		= BKEY_U64s,					\
-	.format		= KEY_FORMAT_CURRENT,				\
-	.p		= POS(_inode, _offset),				\
-	.size		= _size,					\
-})
-
-static inline void bkey_init(struct bkey *k)
-{
-	*k = KEY(0, 0, 0);
-}
-
-#define bkey_bytes(_k)		((_k)->u64s * sizeof(__u64))
-
-#define __BKEY_PADDED(key, pad)					\
-	struct bkey_i key; __u64 key ## _pad[pad]
-
-/*
- * - DELETED keys are used internally to mark keys that should be ignored but
- *   override keys in composition order.  Their version number is ignored.
- *
- * - DISCARDED keys indicate that the data is all 0s because it has been
- *   discarded. DISCARDs may have a version; if the version is nonzero the key
- *   will be persistent, otherwise the key will be dropped whenever the btree
- *   node is rewritten (like DELETED keys).
- *
- * - ERROR: any read of the data returns a read error, as the data was lost due
- *   to a failing device. Like DISCARDED keys, they can be removed (overridden)
- *   by new writes or cluster-wide GC. Node repair can also overwrite them with
- *   the same or a more recent version number, but not with an older version
- *   number.
- *
- * - WHITEOUT: for hash table btrees
- */
-#define BCH_BKEY_TYPES()				\
-	x(deleted,		0)			\
-	x(whiteout,		1)			\
-	x(error,		2)			\
-	x(cookie,		3)			\
-	x(hash_whiteout,	4)			\
-	x(btree_ptr,		5)			\
-	x(extent,		6)			\
-	x(reservation,		7)			\
-	x(inode,		8)			\
-	x(inode_generation,	9)			\
-	x(dirent,		10)			\
-	x(xattr,		11)			\
-	x(alloc,		12)			\
-	x(quota,		13)			\
-	x(stripe,		14)			\
-	x(reflink_p,		15)			\
-	x(reflink_v,		16)			\
-	x(inline_data,		17)			\
-	x(btree_ptr_v2,		18)			\
-	x(indirect_inline_data,	19)			\
-	x(alloc_v2,		20)			\
-	x(subvolume,		21)			\
-	x(snapshot,		22)			\
-	x(inode_v2,		23)			\
-	x(alloc_v3,		24)			\
-	x(set,			25)			\
-	x(lru,			26)			\
-	x(alloc_v4,		27)			\
-	x(backpointer,		28)			\
-	x(inode_v3,		29)			\
-	x(bucket_gens,		30)			\
-	x(snapshot_tree,	31)			\
-	x(logged_op_truncate,	32)			\
-	x(logged_op_finsert,	33)
-
-enum bch_bkey_type {
-#define x(name, nr) KEY_TYPE_##name	= nr,
-	BCH_BKEY_TYPES()
-#undef x
-	KEY_TYPE_MAX,
-};
-
-struct bch_deleted {
-	struct bch_val		v;
-};
-
-struct bch_whiteout {
-	struct bch_val		v;
-};
-
-struct bch_error {
-	struct bch_val		v;
-};
-
-struct bch_cookie {
-	struct bch_val		v;
-	__le64			cookie;
-};
-
-struct bch_hash_whiteout {
-	struct bch_val		v;
-};
-
-struct bch_set {
-	struct bch_val		v;
-};
-
-/* 128 bits, sufficient for cryptographic MACs: */
-struct bch_csum {
-	__le64			lo;
-	__le64			hi;
-} __packed __aligned(8);
-
-struct bch_backpointer {
-	struct bch_val		v;
-	__u8			btree_id;
-	__u8			level;
-	__u8			data_type;
-	__u64			bucket_offset:40;
-	__u32			bucket_len;
-	struct bpos		pos;
-} __packed __aligned(8);
-
-/* LRU btree: */
-
-struct bch_lru {
-	struct bch_val		v;
-	__le64			idx;
-} __packed __aligned(8);
-
-#define LRU_ID_STRIPES		(1U << 16)
-
-/* Optional/variable size superblock sections: */
-
-struct bch_sb_field {
-	__u64			_data[0];
-	__le32			u64s;
-	__le32			type;
-};
-
-#define BCH_SB_FIELDS()				\
-	x(journal,			0)	\
-	x(members_v1,			1)	\
-	x(crypt,			2)	\
-	x(replicas_v0,			3)	\
-	x(quota,			4)	\
-	x(disk_groups,			5)	\
-	x(clean,			6)	\
-	x(replicas,			7)	\
-	x(journal_seq_blacklist,	8)	\
-	x(journal_v2,			9)	\
-	x(counters,			10)	\
-	x(members_v2,			11)	\
-	x(errors,			12)	\
-	x(ext,				13)	\
-	x(downgrade,			14)
-
-#include "alloc_background_format.h"
-#include "extents_format.h"
-#include "reflink_format.h"
-#include "ec_format.h"
-#include "inode_format.h"
-#include "dirent_format.h"
-#include "xattr_format.h"
-#include "quota_format.h"
-#include "logged_ops_format.h"
-#include "snapshot_format.h"
-#include "subvolume_format.h"
-#include "sb-counters_format.h"
-
-enum bch_sb_field_type {
-#define x(f, nr)	BCH_SB_FIELD_##f = nr,
-	BCH_SB_FIELDS()
-#undef x
-	BCH_SB_FIELD_NR
-};
-
-/*
- * Most superblock fields are replicated in all device's superblocks - a few are
- * not:
- */
-#define BCH_SINGLE_DEVICE_SB_FIELDS		\
-	((1U << BCH_SB_FIELD_journal)|		\
-	 (1U << BCH_SB_FIELD_journal_v2))
-
-/* BCH_SB_FIELD_journal: */
-
-struct bch_sb_field_journal {
-	struct bch_sb_field	field;
-	__le64			buckets[];
-};
-
-struct bch_sb_field_journal_v2 {
-	struct bch_sb_field	field;
-
-	struct bch_sb_field_journal_v2_entry {
-		__le64		start;
-		__le64		nr;
-	}			d[];
-};
-
-/* BCH_SB_FIELD_members_v1: */
-
-#define BCH_MIN_NR_NBUCKETS	(1 << 6)
-
-#define BCH_IOPS_MEASUREMENTS()			\
-	x(seqread,	0)			\
-	x(seqwrite,	1)			\
-	x(randread,	2)			\
-	x(randwrite,	3)
-
-enum bch_iops_measurement {
-#define x(t, n) BCH_IOPS_##t = n,
-	BCH_IOPS_MEASUREMENTS()
-#undef x
-	BCH_IOPS_NR
-};
-
-#define BCH_MEMBER_ERROR_TYPES()		\
-	x(read,		0)			\
-	x(write,	1)			\
-	x(checksum,	2)
-
-enum bch_member_error_type {
-#define x(t, n) BCH_MEMBER_ERROR_##t = n,
-	BCH_MEMBER_ERROR_TYPES()
-#undef x
-	BCH_MEMBER_ERROR_NR
-};
-
-struct bch_member {
-	__uuid_t		uuid;
-	__le64			nbuckets;	/* device size */
-	__le16			first_bucket;   /* index of first bucket used */
-	__le16			bucket_size;	/* sectors */
-	__u8			btree_bitmap_shift;
-	__u8			pad[3];
-	__le64			last_mount;	/* time_t */
-
-	__le64			flags;
-	__le32			iops[4];
-	__le64			errors[BCH_MEMBER_ERROR_NR];
-	__le64			errors_at_reset[BCH_MEMBER_ERROR_NR];
-	__le64			errors_reset_time;
-	__le64			seq;
-	__le64			btree_allocated_bitmap;
-	/*
-	 * On recovery from a clean shutdown we don't normally read the journal,
-	 * but we still want to resume writing from where we left off so we
-	 * don't overwrite more than is necessary, for list journal debugging:
-	 */
-	__le32			last_journal_bucket;
-	__le32			last_journal_bucket_offset;
-};
-
-/*
- * This limit comes from the bucket_gens array - it's a single allocation, and
- * kernel allocation are limited to INT_MAX
- */
-#define BCH_MEMBER_NBUCKETS_MAX	(INT_MAX - 64)
-
-#define BCH_MEMBER_V1_BYTES	56
-
-LE64_BITMASK(BCH_MEMBER_STATE,		struct bch_member, flags,  0,  4)
-/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
-LE64_BITMASK(BCH_MEMBER_DISCARD,	struct bch_member, flags, 14, 15)
-LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED,	struct bch_member, flags, 15, 20)
-LE64_BITMASK(BCH_MEMBER_GROUP,		struct bch_member, flags, 20, 28)
-LE64_BITMASK(BCH_MEMBER_DURABILITY,	struct bch_member, flags, 28, 30)
-LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
-					struct bch_member, flags, 30, 31)
-
-#if 0
-LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS,	struct bch_member, flags[1], 0,  20);
-LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
-#endif
-
-#define BCH_MEMBER_STATES()			\
-	x(rw,		0)			\
-	x(ro,		1)			\
-	x(failed,	2)			\
-	x(spare,	3)
-
-enum bch_member_state {
-#define x(t, n) BCH_MEMBER_STATE_##t = n,
-	BCH_MEMBER_STATES()
-#undef x
-	BCH_MEMBER_STATE_NR
-};
-
-struct bch_sb_field_members_v1 {
-	struct bch_sb_field	field;
-	struct bch_member	_members[]; //Members are now variable size
-};
-
-struct bch_sb_field_members_v2 {
-	struct bch_sb_field	field;
-	__le16			member_bytes; //size of single member entry
-	u8			pad[6];
-	struct bch_member	_members[];
-};
-
-/* BCH_SB_FIELD_crypt: */
-
-struct nonce {
-	__le32			d[4];
-};
-
-struct bch_key {
-	__le64			key[4];
-};
-
-#define BCH_KEY_MAGIC					\
-	(((__u64) 'b' <<  0)|((__u64) 'c' <<  8)|		\
-	 ((__u64) 'h' << 16)|((__u64) '*' << 24)|		\
-	 ((__u64) '*' << 32)|((__u64) 'k' << 40)|		\
-	 ((__u64) 'e' << 48)|((__u64) 'y' << 56))
-
-struct bch_encrypted_key {
-	__le64			magic;
-	struct bch_key		key;
-};
-
-/*
- * If this field is present in the superblock, it stores an encryption key which
- * is used encrypt all other data/metadata. The key will normally be encrypted
- * with the key userspace provides, but if encryption has been turned off we'll
- * just store the master key unencrypted in the superblock so we can access the
- * previously encrypted data.
- */
-struct bch_sb_field_crypt {
-	struct bch_sb_field	field;
-
-	__le64			flags;
-	__le64			kdf_flags;
-	struct bch_encrypted_key key;
-};
-
-LE64_BITMASK(BCH_CRYPT_KDF_TYPE,	struct bch_sb_field_crypt, flags, 0, 4);
-
-enum bch_kdf_types {
-	BCH_KDF_SCRYPT		= 0,
-	BCH_KDF_NR		= 1,
-};
-
-/* stored as base 2 log of scrypt params: */
-LE64_BITMASK(BCH_KDF_SCRYPT_N,	struct bch_sb_field_crypt, kdf_flags,  0, 16);
-LE64_BITMASK(BCH_KDF_SCRYPT_R,	struct bch_sb_field_crypt, kdf_flags, 16, 32);
-LE64_BITMASK(BCH_KDF_SCRYPT_P,	struct bch_sb_field_crypt, kdf_flags, 32, 48);
-
-/* BCH_SB_FIELD_replicas: */
-
-#define BCH_DATA_TYPES()		\
-	x(free,		0)		\
-	x(sb,		1)		\
-	x(journal,	2)		\
-	x(btree,	3)		\
-	x(user,		4)		\
-	x(cached,	5)		\
-	x(parity,	6)		\
-	x(stripe,	7)		\
-	x(need_gc_gens,	8)		\
-	x(need_discard,	9)
-
-enum bch_data_type {
-#define x(t, n) BCH_DATA_##t,
-	BCH_DATA_TYPES()
-#undef x
-	BCH_DATA_NR
-};
-
-static inline bool data_type_is_empty(enum bch_data_type type)
-{
-	switch (type) {
-	case BCH_DATA_free:
-	case BCH_DATA_need_gc_gens:
-	case BCH_DATA_need_discard:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static inline bool data_type_is_hidden(enum bch_data_type type)
-{
-	switch (type) {
-	case BCH_DATA_sb:
-	case BCH_DATA_journal:
-		return true;
-	default:
-		return false;
-	}
-}
-
-struct bch_replicas_entry_v0 {
-	__u8			data_type;
-	__u8			nr_devs;
-	__u8			devs[];
-} __packed;
-
-struct bch_sb_field_replicas_v0 {
-	struct bch_sb_field	field;
-	struct bch_replicas_entry_v0 entries[];
-} __packed __aligned(8);
-
-struct bch_replicas_entry_v1 {
-	__u8			data_type;
-	__u8			nr_devs;
-	__u8			nr_required;
-	__u8			devs[];
-} __packed;
-
-#define replicas_entry_bytes(_i)					\
-	(offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
-
-struct bch_sb_field_replicas {
-	struct bch_sb_field	field;
-	struct bch_replicas_entry_v1 entries[];
-} __packed __aligned(8);
-
-/* BCH_SB_FIELD_disk_groups: */
-
-#define BCH_SB_LABEL_SIZE		32
-
-struct bch_disk_group {
-	__u8			label[BCH_SB_LABEL_SIZE];
-	__le64			flags[2];
-} __packed __aligned(8);
-
-LE64_BITMASK(BCH_GROUP_DELETED,		struct bch_disk_group, flags[0], 0,  1)
-LE64_BITMASK(BCH_GROUP_DATA_ALLOWED,	struct bch_disk_group, flags[0], 1,  6)
-LE64_BITMASK(BCH_GROUP_PARENT,		struct bch_disk_group, flags[0], 6, 24)
-
-struct bch_sb_field_disk_groups {
-	struct bch_sb_field	field;
-	struct bch_disk_group	entries[];
-} __packed __aligned(8);
-
-/*
- * On clean shutdown, store btree roots and current journal sequence number in
- * the superblock:
- */
-struct jset_entry {
-	__le16			u64s;
-	__u8			btree_id;
-	__u8			level;
-	__u8			type; /* designates what this jset holds */
-	__u8			pad[3];
-
-	struct bkey_i		start[0];
-	__u64			_data[];
-};
-
-struct bch_sb_field_clean {
-	struct bch_sb_field	field;
-
-	__le32			flags;
-	__le16			_read_clock; /* no longer used */
-	__le16			_write_clock;
-	__le64			journal_seq;
-
-	struct jset_entry	start[0];
-	__u64			_data[];
-};
-
-struct journal_seq_blacklist_entry {
-	__le64			start;
-	__le64			end;
-};
-
-struct bch_sb_field_journal_seq_blacklist {
-	struct bch_sb_field	field;
-	struct journal_seq_blacklist_entry start[];
-};
-
-struct bch_sb_field_errors {
-	struct bch_sb_field	field;
-	struct bch_sb_field_error_entry {
-		__le64		v;
-		__le64		last_error_time;
-	}			entries[];
-};
-
-LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID,	struct bch_sb_field_error_entry, v,  0, 16);
-LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR,	struct bch_sb_field_error_entry, v, 16, 64);
-
-struct bch_sb_field_ext {
-	struct bch_sb_field	field;
-	__le64			recovery_passes_required[2];
-	__le64			errors_silent[8];
-	__le64			btrees_lost_data;
-};
-
-struct bch_sb_field_downgrade_entry {
-	__le16			version;
-	__le64			recovery_passes[2];
-	__le16			nr_errors;
-	__le16			errors[] __counted_by(nr_errors);
-} __packed __aligned(2);
-
-struct bch_sb_field_downgrade {
-	struct bch_sb_field	field;
-	struct bch_sb_field_downgrade_entry entries[];
-};
-
-/* Superblock: */
-
-/*
- * New versioning scheme:
- * One common version number for all on disk data structures - superblock, btree
- * nodes, journal entries
- */
-#define BCH_VERSION_MAJOR(_v)		((__u16) ((_v) >> 10))
-#define BCH_VERSION_MINOR(_v)		((__u16) ((_v) & ~(~0U << 10)))
-#define BCH_VERSION(_major, _minor)	(((_major) << 10)|(_minor) << 0)
-
-/*
- * field 1:		version name
- * field 2:		BCH_VERSION(major, minor)
- * field 3:		recovery passess required on upgrade
- */
-#define BCH_METADATA_VERSIONS()						\
-	x(bkey_renumber,		BCH_VERSION(0, 10))		\
-	x(inode_btree_change,		BCH_VERSION(0, 11))		\
-	x(snapshot,			BCH_VERSION(0, 12))		\
-	x(inode_backpointers,		BCH_VERSION(0, 13))		\
-	x(btree_ptr_sectors_written,	BCH_VERSION(0, 14))		\
-	x(snapshot_2,			BCH_VERSION(0, 15))		\
-	x(reflink_p_fix,		BCH_VERSION(0, 16))		\
-	x(subvol_dirent,		BCH_VERSION(0, 17))		\
-	x(inode_v2,			BCH_VERSION(0, 18))		\
-	x(freespace,			BCH_VERSION(0, 19))		\
-	x(alloc_v4,			BCH_VERSION(0, 20))		\
-	x(new_data_types,		BCH_VERSION(0, 21))		\
-	x(backpointers,			BCH_VERSION(0, 22))		\
-	x(inode_v3,			BCH_VERSION(0, 23))		\
-	x(unwritten_extents,		BCH_VERSION(0, 24))		\
-	x(bucket_gens,			BCH_VERSION(0, 25))		\
-	x(lru_v2,			BCH_VERSION(0, 26))		\
-	x(fragmentation_lru,		BCH_VERSION(0, 27))		\
-	x(no_bps_in_alloc_keys,		BCH_VERSION(0, 28))		\
-	x(snapshot_trees,		BCH_VERSION(0, 29))		\
-	x(major_minor,			BCH_VERSION(1,  0))		\
-	x(snapshot_skiplists,		BCH_VERSION(1,  1))		\
-	x(deleted_inodes,		BCH_VERSION(1,  2))		\
-	x(rebalance_work,		BCH_VERSION(1,  3))		\
-	x(member_seq,			BCH_VERSION(1,  4))		\
-	x(subvolume_fs_parent,		BCH_VERSION(1,  5))		\
-	x(btree_subvolume_children,	BCH_VERSION(1,  6))		\
-	x(mi_btree_bitmap,		BCH_VERSION(1,  7))
-
-enum bcachefs_metadata_version {
-	bcachefs_metadata_version_min = 9,
-#define x(t, n)	bcachefs_metadata_version_##t = n,
-	BCH_METADATA_VERSIONS()
-#undef x
-	bcachefs_metadata_version_max
-};
-
-static const __maybe_unused
-unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work;
-
-#define bcachefs_metadata_version_current	(bcachefs_metadata_version_max - 1)
-
-#define BCH_SB_SECTOR			8
-#define BCH_SB_MEMBERS_MAX		64 /* XXX kill */
-
-#define BCH_SB_LAYOUT_SIZE_BITS_MAX	16 /* 32 MB */
-
-struct bch_sb_layout {
-	__uuid_t		magic;	/* bcachefs superblock UUID */
-	__u8			layout_type;
-	__u8			sb_max_size_bits; /* base 2 of 512 byte sectors */
-	__u8			nr_superblocks;
-	__u8			pad[5];
-	__le64			sb_offset[61];
-} __packed __aligned(8);
-
-#define BCH_SB_LAYOUT_SECTOR	7
-
-/*
- * @offset	- sector where this sb was written
- * @version	- on disk format version
- * @version_min	- Oldest metadata version this filesystem contains; so we can
- *		  safely drop compatibility code and refuse to mount filesystems
- *		  we'd need it for
- * @magic	- identifies as a bcachefs superblock (BCHFS_MAGIC)
- * @seq		- incremented each time superblock is written
- * @uuid	- used for generating various magic numbers and identifying
- *                member devices, never changes
- * @user_uuid	- user visible UUID, may be changed
- * @label	- filesystem label
- * @seq		- identifies most recent superblock, incremented each time
- *		  superblock is written
- * @features	- enabled incompatible features
- */
-struct bch_sb {
-	struct bch_csum		csum;
-	__le16			version;
-	__le16			version_min;
-	__le16			pad[2];
-	__uuid_t		magic;
-	__uuid_t		uuid;
-	__uuid_t		user_uuid;
-	__u8			label[BCH_SB_LABEL_SIZE];
-	__le64			offset;
-	__le64			seq;
-
-	__le16			block_size;
-	__u8			dev_idx;
-	__u8			nr_devices;
-	__le32			u64s;
-
-	__le64			time_base_lo;
-	__le32			time_base_hi;
-	__le32			time_precision;
-
-	__le64			flags[7];
-	__le64			write_time;
-	__le64			features[2];
-	__le64			compat[2];
-
-	struct bch_sb_layout	layout;
-
-	struct bch_sb_field	start[0];
-	__le64			_data[];
-} __packed __aligned(8);
-
-/*
- * Flags:
- * BCH_SB_INITALIZED	- set on first mount
- * BCH_SB_CLEAN		- did we shut down cleanly? Just a hint, doesn't affect
- *			  behaviour of mount/recovery path:
- * BCH_SB_INODE_32BIT	- limit inode numbers to 32 bits
- * BCH_SB_128_BIT_MACS	- 128 bit macs instead of 80
- * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides
- *			   DATA/META_CSUM_TYPE. Also indicates encryption
- *			   algorithm in use, if/when we get more than one
- */
-
-LE16_BITMASK(BCH_SB_BLOCK_SIZE,		struct bch_sb, block_size, 0, 16);
-
-LE64_BITMASK(BCH_SB_INITIALIZED,	struct bch_sb, flags[0],  0,  1);
-LE64_BITMASK(BCH_SB_CLEAN,		struct bch_sb, flags[0],  1,  2);
-LE64_BITMASK(BCH_SB_CSUM_TYPE,		struct bch_sb, flags[0],  2,  8);
-LE64_BITMASK(BCH_SB_ERROR_ACTION,	struct bch_sb, flags[0],  8, 12);
-
-LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE,	struct bch_sb, flags[0], 12, 28);
-
-LE64_BITMASK(BCH_SB_GC_RESERVE,		struct bch_sb, flags[0], 28, 33);
-LE64_BITMASK(BCH_SB_ROOT_RESERVE,	struct bch_sb, flags[0], 33, 40);
-
-LE64_BITMASK(BCH_SB_META_CSUM_TYPE,	struct bch_sb, flags[0], 40, 44);
-LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE,	struct bch_sb, flags[0], 44, 48);
-
-LE64_BITMASK(BCH_SB_META_REPLICAS_WANT,	struct bch_sb, flags[0], 48, 52);
-LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT,	struct bch_sb, flags[0], 52, 56);
-
-LE64_BITMASK(BCH_SB_POSIX_ACL,		struct bch_sb, flags[0], 56, 57);
-LE64_BITMASK(BCH_SB_USRQUOTA,		struct bch_sb, flags[0], 57, 58);
-LE64_BITMASK(BCH_SB_GRPQUOTA,		struct bch_sb, flags[0], 58, 59);
-LE64_BITMASK(BCH_SB_PRJQUOTA,		struct bch_sb, flags[0], 59, 60);
-
-LE64_BITMASK(BCH_SB_HAS_ERRORS,		struct bch_sb, flags[0], 60, 61);
-LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62);
-
-LE64_BITMASK(BCH_SB_BIG_ENDIAN,		struct bch_sb, flags[0], 62, 63);
-
-LE64_BITMASK(BCH_SB_STR_HASH_TYPE,	struct bch_sb, flags[1],  0,  4);
-LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1],  4,  8);
-LE64_BITMASK(BCH_SB_INODE_32BIT,	struct bch_sb, flags[1],  8,  9);
-
-LE64_BITMASK(BCH_SB_128_BIT_MACS,	struct bch_sb, flags[1],  9, 10);
-LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE,	struct bch_sb, flags[1], 10, 14);
-
-/*
- * Max size of an extent that may require bouncing to read or write
- * (checksummed, compressed): 64k
- */
-LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS,
-					struct bch_sb, flags[1], 14, 20);
-
-LE64_BITMASK(BCH_SB_META_REPLICAS_REQ,	struct bch_sb, flags[1], 20, 24);
-LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ,	struct bch_sb, flags[1], 24, 28);
-
-LE64_BITMASK(BCH_SB_PROMOTE_TARGET,	struct bch_sb, flags[1], 28, 40);
-LE64_BITMASK(BCH_SB_FOREGROUND_TARGET,	struct bch_sb, flags[1], 40, 52);
-LE64_BITMASK(BCH_SB_BACKGROUND_TARGET,	struct bch_sb, flags[1], 52, 64);
-
-LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO,
-					struct bch_sb, flags[2],  0,  4);
-LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES,	struct bch_sb, flags[2],  4, 64);
-
-LE64_BITMASK(BCH_SB_ERASURE_CODE,	struct bch_sb, flags[3],  0, 16);
-LE64_BITMASK(BCH_SB_METADATA_TARGET,	struct bch_sb, flags[3], 16, 28);
-LE64_BITMASK(BCH_SB_SHARD_INUMS,	struct bch_sb, flags[3], 28, 29);
-LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
-LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
-LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
-LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
-LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
-LE64_BITMASK(BCH_SB_NOCOW,		struct bch_sb, flags[4], 33, 34);
-LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE,	struct bch_sb, flags[4], 34, 54);
-LE64_BITMASK(BCH_SB_VERSION_UPGRADE,	struct bch_sb, flags[4], 54, 56);
-
-LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_HI,struct bch_sb, flags[4], 56, 60);
-LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI,
-					struct bch_sb, flags[4], 60, 64);
-
-LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE,
-					struct bch_sb, flags[5],  0, 16);
-
-static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
-{
-	return BCH_SB_COMPRESSION_TYPE_LO(sb) | (BCH_SB_COMPRESSION_TYPE_HI(sb) << 4);
-}
-
-static inline void SET_BCH_SB_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
-{
-	SET_BCH_SB_COMPRESSION_TYPE_LO(sb, v);
-	SET_BCH_SB_COMPRESSION_TYPE_HI(sb, v >> 4);
-}
-
-static inline __u64 BCH_SB_BACKGROUND_COMPRESSION_TYPE(const struct bch_sb *sb)
-{
-	return BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb) |
-		(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb) << 4);
-}
-
-static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
-{
-	SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb, v);
-	SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb, v >> 4);
-}
-
-/*
- * Features:
- *
- * journal_seq_blacklist_v3:	gates BCH_SB_FIELD_journal_seq_blacklist
- * reflink:			gates KEY_TYPE_reflink
- * inline_data:			gates KEY_TYPE_inline_data
- * new_siphash:			gates BCH_STR_HASH_siphash
- * new_extent_overwrite:	gates BTREE_NODE_NEW_EXTENT_OVERWRITE
- */
-#define BCH_SB_FEATURES()			\
-	x(lz4,				0)	\
-	x(gzip,				1)	\
-	x(zstd,				2)	\
-	x(atomic_nlink,			3)	\
-	x(ec,				4)	\
-	x(journal_seq_blacklist_v3,	5)	\
-	x(reflink,			6)	\
-	x(new_siphash,			7)	\
-	x(inline_data,			8)	\
-	x(new_extent_overwrite,		9)	\
-	x(incompressible,		10)	\
-	x(btree_ptr_v2,			11)	\
-	x(extents_above_btree_updates,	12)	\
-	x(btree_updates_journalled,	13)	\
-	x(reflink_inline_data,		14)	\
-	x(new_varint,			15)	\
-	x(journal_no_flush,		16)	\
-	x(alloc_v2,			17)	\
-	x(extents_across_btree_nodes,	18)
-
-#define BCH_SB_FEATURES_ALWAYS				\
-	((1ULL << BCH_FEATURE_new_extent_overwrite)|	\
-	 (1ULL << BCH_FEATURE_extents_above_btree_updates)|\
-	 (1ULL << BCH_FEATURE_btree_updates_journalled)|\
-	 (1ULL << BCH_FEATURE_alloc_v2)|\
-	 (1ULL << BCH_FEATURE_extents_across_btree_nodes))
-
-#define BCH_SB_FEATURES_ALL				\
-	(BCH_SB_FEATURES_ALWAYS|			\
-	 (1ULL << BCH_FEATURE_new_siphash)|		\
-	 (1ULL << BCH_FEATURE_btree_ptr_v2)|		\
-	 (1ULL << BCH_FEATURE_new_varint)|		\
-	 (1ULL << BCH_FEATURE_journal_no_flush))
-
-enum bch_sb_feature {
-#define x(f, n) BCH_FEATURE_##f,
-	BCH_SB_FEATURES()
-#undef x
-	BCH_FEATURE_NR,
-};
-
-#define BCH_SB_COMPAT()					\
-	x(alloc_info,				0)	\
-	x(alloc_metadata,			1)	\
-	x(extents_above_btree_updates_done,	2)	\
-	x(bformat_overflow_done,		3)
-
-enum bch_sb_compat {
-#define x(f, n) BCH_COMPAT_##f,
-	BCH_SB_COMPAT()
-#undef x
-	BCH_COMPAT_NR,
-};
-
-/* options: */
-
-#define BCH_VERSION_UPGRADE_OPTS()	\
-	x(compatible,		0)	\
-	x(incompatible,		1)	\
-	x(none,			2)
-
-enum bch_version_upgrade_opts {
-#define x(t, n) BCH_VERSION_UPGRADE_##t = n,
-	BCH_VERSION_UPGRADE_OPTS()
-#undef x
-};
-
-#define BCH_REPLICAS_MAX		4U
-
-#define BCH_BKEY_PTRS_MAX		16U
-
-#define BCH_ERROR_ACTIONS()		\
-	x(continue,		0)	\
-	x(ro,			1)	\
-	x(panic,		2)
-
-enum bch_error_actions {
-#define x(t, n) BCH_ON_ERROR_##t = n,
-	BCH_ERROR_ACTIONS()
-#undef x
-	BCH_ON_ERROR_NR
-};
-
-#define BCH_STR_HASH_TYPES()		\
-	x(crc32c,		0)	\
-	x(crc64,		1)	\
-	x(siphash_old,		2)	\
-	x(siphash,		3)
-
-enum bch_str_hash_type {
-#define x(t, n) BCH_STR_HASH_##t = n,
-	BCH_STR_HASH_TYPES()
-#undef x
-	BCH_STR_HASH_NR
-};
-
-#define BCH_STR_HASH_OPTS()		\
-	x(crc32c,		0)	\
-	x(crc64,		1)	\
-	x(siphash,		2)
-
-enum bch_str_hash_opts {
-#define x(t, n) BCH_STR_HASH_OPT_##t = n,
-	BCH_STR_HASH_OPTS()
-#undef x
-	BCH_STR_HASH_OPT_NR
-};
-
-#define BCH_CSUM_TYPES()			\
-	x(none,				0)	\
-	x(crc32c_nonzero,		1)	\
-	x(crc64_nonzero,		2)	\
-	x(chacha20_poly1305_80,		3)	\
-	x(chacha20_poly1305_128,	4)	\
-	x(crc32c,			5)	\
-	x(crc64,			6)	\
-	x(xxhash,			7)
-
-enum bch_csum_type {
-#define x(t, n) BCH_CSUM_##t = n,
-	BCH_CSUM_TYPES()
-#undef x
-	BCH_CSUM_NR
-};
-
-static const __maybe_unused unsigned bch_crc_bytes[] = {
-	[BCH_CSUM_none]				= 0,
-	[BCH_CSUM_crc32c_nonzero]		= 4,
-	[BCH_CSUM_crc32c]			= 4,
-	[BCH_CSUM_crc64_nonzero]		= 8,
-	[BCH_CSUM_crc64]			= 8,
-	[BCH_CSUM_xxhash]			= 8,
-	[BCH_CSUM_chacha20_poly1305_80]		= 10,
-	[BCH_CSUM_chacha20_poly1305_128]	= 16,
-};
-
-static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
-{
-	switch (type) {
-	case BCH_CSUM_chacha20_poly1305_80:
-	case BCH_CSUM_chacha20_poly1305_128:
-		return true;
-	default:
-		return false;
-	}
-}
-
-#define BCH_CSUM_OPTS()			\
-	x(none,			0)	\
-	x(crc32c,		1)	\
-	x(crc64,		2)	\
-	x(xxhash,		3)
-
-enum bch_csum_opts {
-#define x(t, n) BCH_CSUM_OPT_##t = n,
-	BCH_CSUM_OPTS()
-#undef x
-	BCH_CSUM_OPT_NR
-};
-
-#define BCH_COMPRESSION_TYPES()		\
-	x(none,			0)	\
-	x(lz4_old,		1)	\
-	x(gzip,			2)	\
-	x(lz4,			3)	\
-	x(zstd,			4)	\
-	x(incompressible,	5)
-
-enum bch_compression_type {
-#define x(t, n) BCH_COMPRESSION_TYPE_##t = n,
-	BCH_COMPRESSION_TYPES()
-#undef x
-	BCH_COMPRESSION_TYPE_NR
-};
-
-#define BCH_COMPRESSION_OPTS()		\
-	x(none,		0)		\
-	x(lz4,		1)		\
-	x(gzip,		2)		\
-	x(zstd,		3)
-
-enum bch_compression_opts {
-#define x(t, n) BCH_COMPRESSION_OPT_##t = n,
-	BCH_COMPRESSION_OPTS()
-#undef x
-	BCH_COMPRESSION_OPT_NR
-};
-
-/*
- * Magic numbers
- *
- * The various other data structures have their own magic numbers, which are
- * xored with the first part of the cache set's UUID
- */
-
-#define BCACHE_MAGIC							\
-	UUID_INIT(0xc68573f6, 0x4e1a, 0x45ca,				\
-		  0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81)
-#define BCHFS_MAGIC							\
-	UUID_INIT(0xc68573f6, 0x66ce, 0x90a9,				\
-		  0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
-
-#define BCACHEFS_STATFS_MAGIC		BCACHEFS_SUPER_MAGIC
-
-#define JSET_MAGIC		__cpu_to_le64(0x245235c1a3625032ULL)
-#define BSET_MAGIC		__cpu_to_le64(0x90135c78b99e07f5ULL)
-
-static inline __le64 __bch2_sb_magic(struct bch_sb *sb)
-{
-	__le64 ret;
-
-	memcpy(&ret, &sb->uuid, sizeof(ret));
-	return ret;
-}
-
-static inline __u64 __jset_magic(struct bch_sb *sb)
-{
-	return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC);
-}
-
-static inline __u64 __bset_magic(struct bch_sb *sb)
-{
-	return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC);
-}
-
-/* Journal */
-
-#define JSET_KEYS_U64s	(sizeof(struct jset_entry) / sizeof(__u64))
-
-#define BCH_JSET_ENTRY_TYPES()			\
-	x(btree_keys,		0)		\
-	x(btree_root,		1)		\
-	x(prio_ptrs,		2)		\
-	x(blacklist,		3)		\
-	x(blacklist_v2,		4)		\
-	x(usage,		5)		\
-	x(data_usage,		6)		\
-	x(clock,		7)		\
-	x(dev_usage,		8)		\
-	x(log,			9)		\
-	x(overwrite,		10)		\
-	x(write_buffer_keys,	11)		\
-	x(datetime,		12)
-
-enum bch_jset_entry_type {
-#define x(f, nr)	BCH_JSET_ENTRY_##f	= nr,
-	BCH_JSET_ENTRY_TYPES()
-#undef x
-	BCH_JSET_ENTRY_NR
-};
-
-static inline bool jset_entry_is_key(struct jset_entry *e)
-{
-	switch (e->type) {
-	case BCH_JSET_ENTRY_btree_keys:
-	case BCH_JSET_ENTRY_btree_root:
-	case BCH_JSET_ENTRY_overwrite:
-	case BCH_JSET_ENTRY_write_buffer_keys:
-		return true;
-	}
-
-	return false;
-}
-
-/*
- * Journal sequence numbers can be blacklisted: bsets record the max sequence
- * number of all the journal entries they contain updates for, so that on
- * recovery we can ignore those bsets that contain index updates newer that what
- * made it into the journal.
- *
- * This means that we can't reuse that journal_seq - we have to skip it, and
- * then record that we skipped it so that the next time we crash and recover we
- * don't think there was a missing journal entry.
- */
-struct jset_entry_blacklist {
-	struct jset_entry	entry;
-	__le64			seq;
-};
-
-struct jset_entry_blacklist_v2 {
-	struct jset_entry	entry;
-	__le64			start;
-	__le64			end;
-};
-
-#define BCH_FS_USAGE_TYPES()			\
-	x(reserved,		0)		\
-	x(inodes,		1)		\
-	x(key_version,		2)
-
-enum bch_fs_usage_type {
-#define x(f, nr)	BCH_FS_USAGE_##f	= nr,
-	BCH_FS_USAGE_TYPES()
-#undef x
-	BCH_FS_USAGE_NR
-};
-
-struct jset_entry_usage {
-	struct jset_entry	entry;
-	__le64			v;
-} __packed;
-
-struct jset_entry_data_usage {
-	struct jset_entry	entry;
-	__le64			v;
-	struct bch_replicas_entry_v1 r;
-} __packed;
-
-struct jset_entry_clock {
-	struct jset_entry	entry;
-	__u8			rw;
-	__u8			pad[7];
-	__le64			time;
-} __packed;
-
-struct jset_entry_dev_usage_type {
-	__le64			buckets;
-	__le64			sectors;
-	__le64			fragmented;
-} __packed;
-
-struct jset_entry_dev_usage {
-	struct jset_entry	entry;
-	__le32			dev;
-	__u32			pad;
-
-	__le64			_buckets_ec;		/* No longer used */
-	__le64			_buckets_unavailable;	/* No longer used */
-
-	struct jset_entry_dev_usage_type d[];
-};
-
-static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u)
-{
-	return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) /
-		sizeof(struct jset_entry_dev_usage_type);
-}
-
-struct jset_entry_log {
-	struct jset_entry	entry;
-	u8			d[];
-} __packed __aligned(8);
-
-struct jset_entry_datetime {
-	struct jset_entry	entry;
-	__le64			seconds;
-} __packed __aligned(8);
-
-/*
- * On disk format for a journal entry:
- * seq is monotonically increasing; every journal entry has its own unique
- * sequence number.
- *
- * last_seq is the oldest journal entry that still has keys the btree hasn't
- * flushed to disk yet.
- *
- * version is for on disk format changes.
- */
-struct jset {
-	struct bch_csum		csum;
-
-	__le64			magic;
-	__le64			seq;
-	__le32			version;
-	__le32			flags;
-
-	__le32			u64s; /* size of d[] in u64s */
-
-	__u8			encrypted_start[0];
-
-	__le16			_read_clock; /* no longer used */
-	__le16			_write_clock;
-
-	/* Sequence number of oldest dirty journal entry */
-	__le64			last_seq;
-
-
-	struct jset_entry	start[0];
-	__u64			_data[];
-} __packed __aligned(8);
-
-LE32_BITMASK(JSET_CSUM_TYPE,	struct jset, flags, 0, 4);
-LE32_BITMASK(JSET_BIG_ENDIAN,	struct jset, flags, 4, 5);
-LE32_BITMASK(JSET_NO_FLUSH,	struct jset, flags, 5, 6);
-
-#define BCH_JOURNAL_BUCKETS_MIN		8
-
-/* Btree: */
-
-enum btree_id_flags {
-	BTREE_ID_EXTENTS	= BIT(0),
-	BTREE_ID_SNAPSHOTS	= BIT(1),
-	BTREE_ID_SNAPSHOT_FIELD	= BIT(2),
-	BTREE_ID_DATA		= BIT(3),
-};
-
-#define BCH_BTREE_IDS()								\
-	x(extents,		0,	BTREE_ID_EXTENTS|BTREE_ID_SNAPSHOTS|BTREE_ID_DATA,\
-	  BIT_ULL(KEY_TYPE_whiteout)|						\
-	  BIT_ULL(KEY_TYPE_error)|						\
-	  BIT_ULL(KEY_TYPE_cookie)|						\
-	  BIT_ULL(KEY_TYPE_extent)|						\
-	  BIT_ULL(KEY_TYPE_reservation)|					\
-	  BIT_ULL(KEY_TYPE_reflink_p)|						\
-	  BIT_ULL(KEY_TYPE_inline_data))					\
-	x(inodes,		1,	BTREE_ID_SNAPSHOTS,			\
-	  BIT_ULL(KEY_TYPE_whiteout)|						\
-	  BIT_ULL(KEY_TYPE_inode)|						\
-	  BIT_ULL(KEY_TYPE_inode_v2)|						\
-	  BIT_ULL(KEY_TYPE_inode_v3)|						\
-	  BIT_ULL(KEY_TYPE_inode_generation))					\
-	x(dirents,		2,	BTREE_ID_SNAPSHOTS,			\
-	  BIT_ULL(KEY_TYPE_whiteout)|						\
-	  BIT_ULL(KEY_TYPE_hash_whiteout)|					\
-	  BIT_ULL(KEY_TYPE_dirent))						\
-	x(xattrs,		3,	BTREE_ID_SNAPSHOTS,			\
-	  BIT_ULL(KEY_TYPE_whiteout)|						\
-	  BIT_ULL(KEY_TYPE_cookie)|						\
-	  BIT_ULL(KEY_TYPE_hash_whiteout)|					\
-	  BIT_ULL(KEY_TYPE_xattr))						\
-	x(alloc,		4,	0,					\
-	  BIT_ULL(KEY_TYPE_alloc)|						\
-	  BIT_ULL(KEY_TYPE_alloc_v2)|						\
-	  BIT_ULL(KEY_TYPE_alloc_v3)|						\
-	  BIT_ULL(KEY_TYPE_alloc_v4))						\
-	x(quotas,		5,	0,					\
-	  BIT_ULL(KEY_TYPE_quota))						\
-	x(stripes,		6,	0,					\
-	  BIT_ULL(KEY_TYPE_stripe))						\
-	x(reflink,		7,	BTREE_ID_EXTENTS|BTREE_ID_DATA,		\
-	  BIT_ULL(KEY_TYPE_reflink_v)|						\
-	  BIT_ULL(KEY_TYPE_indirect_inline_data)|				\
-	  BIT_ULL(KEY_TYPE_error))						\
-	x(subvolumes,		8,	0,					\
-	  BIT_ULL(KEY_TYPE_subvolume))						\
-	x(snapshots,		9,	0,					\
-	  BIT_ULL(KEY_TYPE_snapshot))						\
-	x(lru,			10,	0,					\
-	  BIT_ULL(KEY_TYPE_set))						\
-	x(freespace,		11,	BTREE_ID_EXTENTS,			\
-	  BIT_ULL(KEY_TYPE_set))						\
-	x(need_discard,		12,	0,					\
-	  BIT_ULL(KEY_TYPE_set))						\
-	x(backpointers,		13,	0,					\
-	  BIT_ULL(KEY_TYPE_backpointer))					\
-	x(bucket_gens,		14,	0,					\
-	  BIT_ULL(KEY_TYPE_bucket_gens))					\
-	x(snapshot_trees,	15,	0,					\
-	  BIT_ULL(KEY_TYPE_snapshot_tree))					\
-	x(deleted_inodes,	16,	BTREE_ID_SNAPSHOT_FIELD,		\
-	  BIT_ULL(KEY_TYPE_set))						\
-	x(logged_ops,		17,	0,					\
-	  BIT_ULL(KEY_TYPE_logged_op_truncate)|					\
-	  BIT_ULL(KEY_TYPE_logged_op_finsert))					\
-	x(rebalance_work,	18,	BTREE_ID_SNAPSHOT_FIELD,		\
-	  BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))			\
-	x(subvolume_children,	19,	0,					\
-	  BIT_ULL(KEY_TYPE_set))
-
-enum btree_id {
-#define x(name, nr, ...) BTREE_ID_##name = nr,
-	BCH_BTREE_IDS()
-#undef x
-	BTREE_ID_NR
-};
-
-static inline bool btree_id_is_alloc(enum btree_id id)
-{
-	switch (id) {
-	case BTREE_ID_alloc:
-	case BTREE_ID_backpointers:
-	case BTREE_ID_need_discard:
-	case BTREE_ID_freespace:
-	case BTREE_ID_bucket_gens:
-		return true;
-	default:
-		return false;
-	}
-}
-
-#define BTREE_MAX_DEPTH		4U
-
-/* Btree nodes */
-
-/*
- * Btree nodes
- *
- * On disk a btree node is a list/log of these; within each set the keys are
- * sorted
- */
-struct bset {
-	__le64			seq;
-
-	/*
-	 * Highest journal entry this bset contains keys for.
-	 * If on recovery we don't see that journal entry, this bset is ignored:
-	 * this allows us to preserve the order of all index updates after a
-	 * crash, since the journal records a total order of all index updates
-	 * and anything that didn't make it to the journal doesn't get used.
-	 */
-	__le64			journal_seq;
-
-	__le32			flags;
-	__le16			version;
-	__le16			u64s; /* count of d[] in u64s */
-
-	struct bkey_packed	start[0];
-	__u64			_data[];
-} __packed __aligned(8);
-
-LE32_BITMASK(BSET_CSUM_TYPE,	struct bset, flags, 0, 4);
-
-LE32_BITMASK(BSET_BIG_ENDIAN,	struct bset, flags, 4, 5);
-LE32_BITMASK(BSET_SEPARATE_WHITEOUTS,
-				struct bset, flags, 5, 6);
-
-/* Sector offset within the btree node: */
-LE32_BITMASK(BSET_OFFSET,	struct bset, flags, 16, 32);
-
-struct btree_node {
-	struct bch_csum		csum;
-	__le64			magic;
-
-	/* this flags field is encrypted, unlike bset->flags: */
-	__le64			flags;
-
-	/* Closed interval: */
-	struct bpos		min_key;
-	struct bpos		max_key;
-	struct bch_extent_ptr	_ptr; /* not used anymore */
-	struct bkey_format	format;
-
-	union {
-	struct bset		keys;
-	struct {
-		__u8		pad[22];
-		__le16		u64s;
-		__u64		_data[0];
-
-	};
-	};
-} __packed __aligned(8);
-
-LE64_BITMASK(BTREE_NODE_ID_LO,	struct btree_node, flags,  0,  4);
-LE64_BITMASK(BTREE_NODE_LEVEL,	struct btree_node, flags,  4,  8);
-LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE,
-				struct btree_node, flags,  8,  9);
-LE64_BITMASK(BTREE_NODE_ID_HI,	struct btree_node, flags,  9, 25);
-/* 25-32 unused */
-LE64_BITMASK(BTREE_NODE_SEQ,	struct btree_node, flags, 32, 64);
-
-static inline __u64 BTREE_NODE_ID(struct btree_node *n)
-{
-	return BTREE_NODE_ID_LO(n) | (BTREE_NODE_ID_HI(n) << 4);
-}
-
-static inline void SET_BTREE_NODE_ID(struct btree_node *n, __u64 v)
-{
-	SET_BTREE_NODE_ID_LO(n, v);
-	SET_BTREE_NODE_ID_HI(n, v >> 4);
-}
-
-struct btree_node_entry {
-	struct bch_csum		csum;
-
-	union {
-	struct bset		keys;
-	struct {
-		__u8		pad[22];
-		__le16		u64s;
-		__u64		_data[0];
-	};
-	};
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_FORMAT_H */
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
deleted file mode 100644
index 4b8fba754b1c..000000000000
--- a/fs/bcachefs/bcachefs_ioctl.h
+++ /dev/null
@@ -1,412 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IOCTL_H
-#define _BCACHEFS_IOCTL_H
-
-#include <linux/uuid.h>
-#include <asm/ioctl.h>
-#include "bcachefs_format.h"
-
-/*
- * Flags common to multiple ioctls:
- */
-#define BCH_FORCE_IF_DATA_LOST		(1 << 0)
-#define BCH_FORCE_IF_METADATA_LOST	(1 << 1)
-#define BCH_FORCE_IF_DATA_DEGRADED	(1 << 2)
-#define BCH_FORCE_IF_METADATA_DEGRADED	(1 << 3)
-
-#define BCH_FORCE_IF_LOST			\
-	(BCH_FORCE_IF_DATA_LOST|		\
-	 BCH_FORCE_IF_METADATA_LOST)
-#define BCH_FORCE_IF_DEGRADED			\
-	(BCH_FORCE_IF_DATA_DEGRADED|		\
-	 BCH_FORCE_IF_METADATA_DEGRADED)
-
-/*
- * If cleared, ioctl that refer to a device pass it as a pointer to a pathname
- * (e.g. /dev/sda1); if set, the dev field is the device's index within the
- * filesystem:
- */
-#define BCH_BY_INDEX			(1 << 4)
-
-/*
- * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem
- * wide superblock:
- */
-#define BCH_READ_DEV			(1 << 5)
-
-/* global control dev: */
-
-/* These are currently broken, and probably unnecessary: */
-#if 0
-#define BCH_IOCTL_ASSEMBLE	_IOW(0xbc, 1, struct bch_ioctl_assemble)
-#define BCH_IOCTL_INCREMENTAL	_IOW(0xbc, 2, struct bch_ioctl_incremental)
-
-struct bch_ioctl_assemble {
-	__u32			flags;
-	__u32			nr_devs;
-	__u64			pad;
-	__u64			devs[];
-};
-
-struct bch_ioctl_incremental {
-	__u32			flags;
-	__u64			pad;
-	__u64			dev;
-};
-#endif
-
-/* filesystem ioctls: */
-
-#define BCH_IOCTL_QUERY_UUID	_IOR(0xbc,	1,  struct bch_ioctl_query_uuid)
-
-/* These only make sense when we also have incremental assembly */
-#if 0
-#define BCH_IOCTL_START		_IOW(0xbc,	2,  struct bch_ioctl_start)
-#define BCH_IOCTL_STOP		_IO(0xbc,	3)
-#endif
-
-#define BCH_IOCTL_DISK_ADD	_IOW(0xbc,	4,  struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_REMOVE	_IOW(0xbc,	5,  struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_ONLINE	_IOW(0xbc,	6,  struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_OFFLINE	_IOW(0xbc,	7,  struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc,	8,  struct bch_ioctl_disk_set_state)
-#define BCH_IOCTL_DATA		_IOW(0xbc,	10, struct bch_ioctl_data)
-#define BCH_IOCTL_FS_USAGE	_IOWR(0xbc,	11, struct bch_ioctl_fs_usage)
-#define BCH_IOCTL_DEV_USAGE	_IOWR(0xbc,	11, struct bch_ioctl_dev_usage)
-#define BCH_IOCTL_READ_SUPER	_IOW(0xbc,	12, struct bch_ioctl_read_super)
-#define BCH_IOCTL_DISK_GET_IDX	_IOW(0xbc,	13,  struct bch_ioctl_disk_get_idx)
-#define BCH_IOCTL_DISK_RESIZE	_IOW(0xbc,	14,  struct bch_ioctl_disk_resize)
-#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15,  struct bch_ioctl_disk_resize_journal)
-
-#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc,	16,  struct bch_ioctl_subvolume)
-#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc,	17,  struct bch_ioctl_subvolume)
-
-#define BCH_IOCTL_DEV_USAGE_V2	_IOWR(0xbc,	18, struct bch_ioctl_dev_usage_v2)
-
-#define BCH_IOCTL_FSCK_OFFLINE	_IOW(0xbc,	19,  struct bch_ioctl_fsck_offline)
-#define BCH_IOCTL_FSCK_ONLINE	_IOW(0xbc,	20,  struct bch_ioctl_fsck_online)
-
-/* ioctl below act on a particular file, not the filesystem as a whole: */
-
-#define BCHFS_IOC_REINHERIT_ATTRS	_IOR(0xbc, 64, const char __user *)
-
-/*
- * BCH_IOCTL_QUERY_UUID: get filesystem UUID
- *
- * Returns user visible UUID, not internal UUID (which may not ever be changed);
- * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with
- * this UUID.
- */
-struct bch_ioctl_query_uuid {
-	__uuid_t		uuid;
-};
-
-#if 0
-struct bch_ioctl_start {
-	__u32			flags;
-	__u32			pad;
-};
-#endif
-
-/*
- * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
- *
- * The specified device must not be open or in use. On success, the new device
- * will be an online member of the filesystem just like any other member.
- *
- * The device must first be prepared by userspace by formatting with a bcachefs
- * superblock, which is only used for passing in superblock options/parameters
- * for that device (in struct bch_member). The new device's superblock should
- * not claim to be a member of any existing filesystem - UUIDs on it will be
- * ignored.
- */
-
-/*
- * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem
- *
- * Any data present on @dev will be permanently deleted, and @dev will be
- * removed from its slot in the filesystem's list of member devices. The device
- * may be either offline or offline.
- *
- * Will fail removing @dev would leave us with insufficient read write devices
- * or degraded/unavailable data, unless the approprate BCH_FORCE_IF_* flags are
- * set.
- */
-
-/*
- * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
- * but is not open (e.g. because we started in degraded mode), bring it online
- *
- * all existing data on @dev will be available once the device is online,
- * exactly as if @dev was present when the filesystem was first mounted
- */
-
-/*
- * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
- * block device, without removing it from the filesystem (so it can be brought
- * back online later)
- *
- * Data present on @dev will be unavailable while @dev is offline (unless
- * replicated), but will still be intact and untouched if @dev is brought back
- * online
- *
- * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
- * leave us with insufficient read write devices or degraded/unavailable data,
- * unless the approprate BCH_FORCE_IF_* flags are set.
- */
-
-struct bch_ioctl_disk {
-	__u32			flags;
-	__u32			pad;
-	__u64			dev;
-};
-
-/*
- * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
- *
- * @new_state		- one of the bch_member_state states (rw, ro, failed,
- *			  spare)
- *
- * Will refuse to change member state if we would then have insufficient devices
- * to write to, or if it would result in degraded data (when @new_state is
- * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
- */
-struct bch_ioctl_disk_set_state {
-	__u32			flags;
-	__u8			new_state;
-	__u8			pad[3];
-	__u64			dev;
-};
-
-#define BCH_DATA_OPS()			\
-	x(scrub,		0)	\
-	x(rereplicate,		1)	\
-	x(migrate,		2)	\
-	x(rewrite_old_nodes,	3)	\
-	x(drop_extra_replicas,	4)
-
-enum bch_data_ops {
-#define x(t, n) BCH_DATA_OP_##t = n,
-	BCH_DATA_OPS()
-#undef x
-	BCH_DATA_OP_NR
-};
-
-/*
- * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
- * scrub, rereplicate, migrate).
- *
- * This ioctl kicks off a job in the background, and returns a file descriptor.
- * Reading from the file descriptor returns a struct bch_ioctl_data_event,
- * indicating current progress, and closing the file descriptor will stop the
- * job. The file descriptor is O_CLOEXEC.
- */
-struct bch_ioctl_data {
-	__u16			op;
-	__u8			start_btree;
-	__u8			end_btree;
-	__u32			flags;
-
-	struct bpos		start_pos;
-	struct bpos		end_pos;
-
-	union {
-	struct {
-		__u32		dev;
-		__u32		pad;
-	}			migrate;
-	struct {
-		__u64		pad[8];
-	};
-	};
-} __packed __aligned(8);
-
-enum bch_data_event {
-	BCH_DATA_EVENT_PROGRESS	= 0,
-	/* XXX: add an event for reporting errors */
-	BCH_DATA_EVENT_NR	= 1,
-};
-
-struct bch_ioctl_data_progress {
-	__u8			data_type;
-	__u8			btree_id;
-	__u8			pad[2];
-	struct bpos		pos;
-
-	__u64			sectors_done;
-	__u64			sectors_total;
-} __packed __aligned(8);
-
-struct bch_ioctl_data_event {
-	__u8			type;
-	__u8			pad[7];
-	union {
-	struct bch_ioctl_data_progress p;
-	__u64			pad2[15];
-	};
-} __packed __aligned(8);
-
-struct bch_replicas_usage {
-	__u64			sectors;
-	struct bch_replicas_entry_v1 r;
-} __packed;
-
-static inline struct bch_replicas_usage *
-replicas_usage_next(struct bch_replicas_usage *u)
-{
-	return (void *) u + replicas_entry_bytes(&u->r) + 8;
-}
-
-/*
- * BCH_IOCTL_FS_USAGE: query filesystem disk space usage
- *
- * Returns disk space usage broken out by data type, number of replicas, and
- * by component device
- *
- * @replica_entries_bytes - size, in bytes, allocated for replica usage entries
- *
- * On success, @replica_entries_bytes will be changed to indicate the number of
- * bytes actually used.
- *
- * Returns -ERANGE if @replica_entries_bytes was too small
- */
-struct bch_ioctl_fs_usage {
-	__u64			capacity;
-	__u64			used;
-	__u64			online_reserved;
-	__u64			persistent_reserved[BCH_REPLICAS_MAX];
-
-	__u32			replica_entries_bytes;
-	__u32			pad;
-
-	struct bch_replicas_usage replicas[];
-};
-
-/*
- * BCH_IOCTL_DEV_USAGE: query device disk space usage
- *
- * Returns disk space usage broken out by data type - both by buckets and
- * sectors.
- */
-struct bch_ioctl_dev_usage {
-	__u64			dev;
-	__u32			flags;
-	__u8			state;
-	__u8			pad[7];
-
-	__u32			bucket_size;
-	__u64			nr_buckets;
-
-	__u64			buckets_ec;
-
-	struct bch_ioctl_dev_usage_type {
-		__u64		buckets;
-		__u64		sectors;
-		__u64		fragmented;
-	}			d[10];
-};
-
-struct bch_ioctl_dev_usage_v2 {
-	__u64			dev;
-	__u32			flags;
-	__u8			state;
-	__u8			nr_data_types;
-	__u8			pad[6];
-
-	__u32			bucket_size;
-	__u64			nr_buckets;
-
-	struct bch_ioctl_dev_usage_type d[];
-};
-
-/*
- * BCH_IOCTL_READ_SUPER: read filesystem superblock
- *
- * Equivalent to reading the superblock directly from the block device, except
- * avoids racing with the kernel writing the superblock or having to figure out
- * which block device to read
- *
- * @sb		- buffer to read into
- * @size	- size of userspace allocated buffer
- * @dev		- device to read superblock for, if BCH_READ_DEV flag is
- *		  specified
- *
- * Returns -ERANGE if buffer provided is too small
- */
-struct bch_ioctl_read_super {
-	__u32			flags;
-	__u32			pad;
-	__u64			dev;
-	__u64			size;
-	__u64			sb;
-};
-
-/*
- * BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to
- * determine if disk is a (online) member - if so, returns device's index
- *
- * Returns -ENOENT if not found
- */
-struct bch_ioctl_disk_get_idx {
-	__u64			dev;
-};
-
-/*
- * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device
- *
- * @dev		- member to resize
- * @nbuckets	- new number of buckets
- */
-struct bch_ioctl_disk_resize {
-	__u32			flags;
-	__u32			pad;
-	__u64			dev;
-	__u64			nbuckets;
-};
-
-/*
- * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device
- *
- * @dev		- member to resize
- * @nbuckets	- new number of buckets
- */
-struct bch_ioctl_disk_resize_journal {
-	__u32			flags;
-	__u32			pad;
-	__u64			dev;
-	__u64			nbuckets;
-};
-
-struct bch_ioctl_subvolume {
-	__u32			flags;
-	__u32			dirfd;
-	__u16			mode;
-	__u16			pad[3];
-	__u64			dst_ptr;
-	__u64			src_ptr;
-};
-
-#define BCH_SUBVOL_SNAPSHOT_CREATE	(1U << 0)
-#define BCH_SUBVOL_SNAPSHOT_RO		(1U << 1)
-
-/*
- * BCH_IOCTL_FSCK_OFFLINE: run fsck from the 'bcachefs fsck' userspace command,
- * but with the kernel's implementation of fsck:
- */
-struct bch_ioctl_fsck_offline {
-	__u64			flags;
-	__u64			opts;		/* string */
-	__u64			nr_devs;
-	__u64			devs[] __counted_by(nr_devs);
-};
-
-/*
- * BCH_IOCTL_FSCK_ONLINE: run fsck from the 'bcachefs fsck' userspace command,
- * but with the kernel's implementation of fsck:
- */
-struct bch_ioctl_fsck_online {
-	__u64			flags;
-	__u64			opts;		/* string */
-};
-
-#endif /* _BCACHEFS_IOCTL_H */
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
deleted file mode 100644
index f46978e5cb7c..000000000000
--- a/fs/bcachefs/bkey.c
+++ /dev/null
@@ -1,1117 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey.h"
-#include "bkey_cmp.h"
-#include "bkey_methods.h"
-#include "bset.h"
-#include "util.h"
-
-const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT;
-
-void bch2_bkey_packed_to_binary_text(struct printbuf *out,
-				     const struct bkey_format *f,
-				     const struct bkey_packed *k)
-{
-	const u64 *p = high_word(f, k);
-	unsigned word_bits = 64 - high_bit_offset;
-	unsigned nr_key_bits = bkey_format_key_bits(f) + high_bit_offset;
-	u64 v = *p & (~0ULL >> high_bit_offset);
-
-	if (!nr_key_bits) {
-		prt_str(out, "(empty)");
-		return;
-	}
-
-	while (1) {
-		unsigned next_key_bits = nr_key_bits;
-
-		if (nr_key_bits < 64) {
-			v >>= 64 - nr_key_bits;
-			next_key_bits = 0;
-		} else {
-			next_key_bits -= 64;
-		}
-
-		bch2_prt_u64_base2_nbits(out, v, min(word_bits, nr_key_bits));
-
-		if (!next_key_bits)
-			break;
-
-		prt_char(out, ' ');
-
-		p = next_word(p);
-		v = *p;
-		word_bits = 64;
-		nr_key_bits = next_key_bits;
-	}
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-static void bch2_bkey_pack_verify(const struct bkey_packed *packed,
-				  const struct bkey *unpacked,
-				  const struct bkey_format *format)
-{
-	struct bkey tmp;
-
-	BUG_ON(bkeyp_val_u64s(format, packed) !=
-	       bkey_val_u64s(unpacked));
-
-	BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed));
-
-	tmp = __bch2_bkey_unpack_key(format, packed);
-
-	if (memcmp(&tmp, unpacked, sizeof(struct bkey))) {
-		struct printbuf buf = PRINTBUF;
-
-		prt_printf(&buf, "keys differ: format u64s %u fields %u %u %u %u %u\n",
-		      format->key_u64s,
-		      format->bits_per_field[0],
-		      format->bits_per_field[1],
-		      format->bits_per_field[2],
-		      format->bits_per_field[3],
-		      format->bits_per_field[4]);
-
-		prt_printf(&buf, "compiled unpack: ");
-		bch2_bkey_to_text(&buf, unpacked);
-		prt_newline(&buf);
-
-		prt_printf(&buf, "c unpack:        ");
-		bch2_bkey_to_text(&buf, &tmp);
-		prt_newline(&buf);
-
-		prt_printf(&buf, "compiled unpack: ");
-		bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current,
-						(struct bkey_packed *) unpacked);
-		prt_newline(&buf);
-
-		prt_printf(&buf, "c unpack:        ");
-		bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current,
-						(struct bkey_packed *) &tmp);
-		prt_newline(&buf);
-
-		panic("%s", buf.buf);
-	}
-}
-
-#else
-static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed,
-					const struct bkey *unpacked,
-					const struct bkey_format *format) {}
-#endif
-
-struct pack_state {
-	const struct bkey_format *format;
-	unsigned		bits;	/* bits remaining in current word */
-	u64			w;	/* current word */
-	u64			*p;	/* pointer to next word */
-};
-
-__always_inline
-static struct pack_state pack_state_init(const struct bkey_format *format,
-					 struct bkey_packed *k)
-{
-	u64 *p = high_word(format, k);
-
-	return (struct pack_state) {
-		.format	= format,
-		.bits	= 64 - high_bit_offset,
-		.w	= 0,
-		.p	= p,
-	};
-}
-
-__always_inline
-static void pack_state_finish(struct pack_state *state,
-			      struct bkey_packed *k)
-{
-	EBUG_ON(state->p <  k->_data);
-	EBUG_ON(state->p >= (u64 *) k->_data + state->format->key_u64s);
-
-	*state->p = state->w;
-}
-
-struct unpack_state {
-	const struct bkey_format *format;
-	unsigned		bits;	/* bits remaining in current word */
-	u64			w;	/* current word */
-	const u64		*p;	/* pointer to next word */
-};
-
-__always_inline
-static struct unpack_state unpack_state_init(const struct bkey_format *format,
-					     const struct bkey_packed *k)
-{
-	const u64 *p = high_word(format, k);
-
-	return (struct unpack_state) {
-		.format	= format,
-		.bits	= 64 - high_bit_offset,
-		.w	= *p << high_bit_offset,
-		.p	= p,
-	};
-}
-
-__always_inline
-static u64 get_inc_field(struct unpack_state *state, unsigned field)
-{
-	unsigned bits = state->format->bits_per_field[field];
-	u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]);
-
-	if (bits >= state->bits) {
-		v = state->w >> (64 - bits);
-		bits -= state->bits;
-
-		state->p = next_word(state->p);
-		state->w = *state->p;
-		state->bits = 64;
-	}
-
-	/* avoid shift by 64 if bits is 0 - bits is never 64 here: */
-	v |= (state->w >> 1) >> (63 - bits);
-	state->w <<= bits;
-	state->bits -= bits;
-
-	return v + offset;
-}
-
-__always_inline
-static void __set_inc_field(struct pack_state *state, unsigned field, u64 v)
-{
-	unsigned bits = state->format->bits_per_field[field];
-
-	if (bits) {
-		if (bits > state->bits) {
-			bits -= state->bits;
-			/* avoid shift by 64 if bits is 64 - bits is never 0 here: */
-			state->w |= (v >> 1) >> (bits - 1);
-
-			*state->p = state->w;
-			state->p = next_word(state->p);
-			state->w = 0;
-			state->bits = 64;
-		}
-
-		state->bits -= bits;
-		state->w |= v << state->bits;
-	}
-}
-
-__always_inline
-static bool set_inc_field(struct pack_state *state, unsigned field, u64 v)
-{
-	unsigned bits = state->format->bits_per_field[field];
-	u64 offset = le64_to_cpu(state->format->field_offset[field]);
-
-	if (v < offset)
-		return false;
-
-	v -= offset;
-
-	if (fls64(v) > bits)
-		return false;
-
-	__set_inc_field(state, field, v);
-	return true;
-}
-
-/*
- * Note: does NOT set out->format (we don't know what it should be here!)
- *
- * Also: doesn't work on extents - it doesn't preserve the invariant that
- * if k is packed bkey_start_pos(k) will successfully pack
- */
-static bool bch2_bkey_transform_key(const struct bkey_format *out_f,
-				   struct bkey_packed *out,
-				   const struct bkey_format *in_f,
-				   const struct bkey_packed *in)
-{
-	struct pack_state out_s = pack_state_init(out_f, out);
-	struct unpack_state in_s = unpack_state_init(in_f, in);
-	u64 *w = out->_data;
-	unsigned i;
-
-	*w = 0;
-
-	for (i = 0; i < BKEY_NR_FIELDS; i++)
-		if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i)))
-			return false;
-
-	/* Can't happen because the val would be too big to unpack: */
-	EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX);
-
-	pack_state_finish(&out_s, out);
-	out->u64s	= out_f->key_u64s + in->u64s - in_f->key_u64s;
-	out->needs_whiteout = in->needs_whiteout;
-	out->type	= in->type;
-
-	return true;
-}
-
-bool bch2_bkey_transform(const struct bkey_format *out_f,
-			struct bkey_packed *out,
-			const struct bkey_format *in_f,
-			const struct bkey_packed *in)
-{
-	if (!bch2_bkey_transform_key(out_f, out, in_f, in))
-		return false;
-
-	memcpy_u64s((u64 *) out + out_f->key_u64s,
-		    (u64 *) in + in_f->key_u64s,
-		    (in->u64s - in_f->key_u64s));
-	return true;
-}
-
-struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format,
-			      const struct bkey_packed *in)
-{
-	struct unpack_state state = unpack_state_init(format, in);
-	struct bkey out;
-
-	EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
-	EBUG_ON(in->u64s < format->key_u64s);
-	EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
-	EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX);
-
-	out.u64s	= BKEY_U64s + in->u64s - format->key_u64s;
-	out.format	= KEY_FORMAT_CURRENT;
-	out.needs_whiteout = in->needs_whiteout;
-	out.type	= in->type;
-	out.pad[0]	= 0;
-
-#define x(id, field)	out.field = get_inc_field(&state, id);
-	bkey_fields()
-#undef x
-
-	return out;
-}
-
-#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
-struct bpos __bkey_unpack_pos(const struct bkey_format *format,
-				     const struct bkey_packed *in)
-{
-	struct unpack_state state = unpack_state_init(format, in);
-	struct bpos out;
-
-	EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
-	EBUG_ON(in->u64s < format->key_u64s);
-	EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
-
-	out.inode	= get_inc_field(&state, BKEY_FIELD_INODE);
-	out.offset	= get_inc_field(&state, BKEY_FIELD_OFFSET);
-	out.snapshot	= get_inc_field(&state, BKEY_FIELD_SNAPSHOT);
-
-	return out;
-}
-#endif
-
-/**
- * bch2_bkey_pack_key -- pack just the key, not the value
- * @out:	packed result
- * @in:		key to pack
- * @format:	format of packed result
- *
- * Returns: true on success, false on failure
- */
-bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
-			const struct bkey_format *format)
-{
-	struct pack_state state = pack_state_init(format, out);
-	u64 *w = out->_data;
-
-	EBUG_ON((void *) in == (void *) out);
-	EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
-	EBUG_ON(in->format != KEY_FORMAT_CURRENT);
-
-	*w = 0;
-
-#define x(id, field)	if (!set_inc_field(&state, id, in->field)) return false;
-	bkey_fields()
-#undef x
-	pack_state_finish(&state, out);
-	out->u64s	= format->key_u64s + in->u64s - BKEY_U64s;
-	out->format	= KEY_FORMAT_LOCAL_BTREE;
-	out->needs_whiteout = in->needs_whiteout;
-	out->type	= in->type;
-
-	bch2_bkey_pack_verify(out, in, format);
-	return true;
-}
-
-/**
- * bch2_bkey_unpack -- unpack the key and the value
- * @b:		btree node of @src key (for packed format)
- * @dst:	unpacked result
- * @src:	packed input
- */
-void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst,
-		      const struct bkey_packed *src)
-{
-	__bkey_unpack_key(b, &dst->k, src);
-
-	memcpy_u64s(&dst->v,
-		    bkeyp_val(&b->format, src),
-		    bkeyp_val_u64s(&b->format, src));
-}
-
-/**
- * bch2_bkey_pack -- pack the key and the value
- * @dst:	packed result
- * @src:	unpacked input
- * @format:	format of packed result
- *
- * Returns: true on success, false on failure
- */
-bool bch2_bkey_pack(struct bkey_packed *dst, const struct bkey_i *src,
-		    const struct bkey_format *format)
-{
-	struct bkey_packed tmp;
-
-	if (!bch2_bkey_pack_key(&tmp, &src->k, format))
-		return false;
-
-	memmove_u64s((u64 *) dst + format->key_u64s,
-		     &src->v,
-		     bkey_val_u64s(&src->k));
-	memcpy_u64s_small(dst, &tmp, format->key_u64s);
-
-	return true;
-}
-
-__always_inline
-static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v)
-{
-	unsigned bits = state->format->bits_per_field[field];
-	u64 offset = le64_to_cpu(state->format->field_offset[field]);
-	bool ret = true;
-
-	EBUG_ON(v < offset);
-	v -= offset;
-
-	if (fls64(v) > bits) {
-		v = ~(~0ULL << bits);
-		ret = false;
-	}
-
-	__set_inc_field(state, field, v);
-	return ret;
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-static bool bkey_packed_successor(struct bkey_packed *out,
-				  const struct btree *b,
-				  struct bkey_packed k)
-{
-	const struct bkey_format *f = &b->format;
-	unsigned nr_key_bits = b->nr_key_bits;
-	unsigned first_bit, offset;
-	u64 *p;
-
-	EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
-
-	if (!nr_key_bits)
-		return false;
-
-	*out = k;
-
-	first_bit = high_bit_offset + nr_key_bits - 1;
-	p = nth_word(high_word(f, out), first_bit >> 6);
-	offset = 63 - (first_bit & 63);
-
-	while (nr_key_bits) {
-		unsigned bits = min(64 - offset, nr_key_bits);
-		u64 mask = (~0ULL >> (64 - bits)) << offset;
-
-		if ((*p & mask) != mask) {
-			*p += 1ULL << offset;
-			EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0);
-			return true;
-		}
-
-		*p &= ~mask;
-		p = prev_word(p);
-		nr_key_bits -= bits;
-		offset = 0;
-	}
-
-	return false;
-}
-
-static bool bkey_format_has_too_big_fields(const struct bkey_format *f)
-{
-	for (unsigned i = 0; i < f->nr_fields; i++) {
-		unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
-		u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
-		u64 packed_max = f->bits_per_field[i]
-			? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
-			: 0;
-		u64 field_offset = le64_to_cpu(f->field_offset[i]);
-
-		if (packed_max + field_offset < packed_max ||
-		    packed_max + field_offset > unpacked_max)
-			return true;
-	}
-
-	return false;
-}
-#endif
-
-/*
- * Returns a packed key that compares <= in
- *
- * This is used in bset_search_tree(), where we need a packed pos in order to be
- * able to compare against the keys in the auxiliary search tree - and it's
- * legal to use a packed pos that isn't equivalent to the original pos,
- * _provided_ it compares <= to the original pos.
- */
-enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
-					   struct bpos in,
-					   const struct btree *b)
-{
-	const struct bkey_format *f = &b->format;
-	struct pack_state state = pack_state_init(f, out);
-	u64 *w = out->_data;
-#ifdef CONFIG_BCACHEFS_DEBUG
-	struct bpos orig = in;
-#endif
-	bool exact = true;
-	unsigned i;
-
-	/*
-	 * bch2_bkey_pack_key() will write to all of f->key_u64s, minus the 3
-	 * byte header, but pack_pos() won't if the len/version fields are big
-	 * enough - we need to make sure to zero them out:
-	 */
-	for (i = 0; i < f->key_u64s; i++)
-		w[i] = 0;
-
-	if (unlikely(in.snapshot <
-		     le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) {
-		if (!in.offset-- &&
-		    !in.inode--)
-			return BKEY_PACK_POS_FAIL;
-		in.snapshot	= KEY_SNAPSHOT_MAX;
-		exact = false;
-	}
-
-	if (unlikely(in.offset <
-		     le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) {
-		if (!in.inode--)
-			return BKEY_PACK_POS_FAIL;
-		in.offset	= KEY_OFFSET_MAX;
-		in.snapshot	= KEY_SNAPSHOT_MAX;
-		exact = false;
-	}
-
-	if (unlikely(in.inode <
-		     le64_to_cpu(f->field_offset[BKEY_FIELD_INODE])))
-		return BKEY_PACK_POS_FAIL;
-
-	if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode))) {
-		in.offset	= KEY_OFFSET_MAX;
-		in.snapshot	= KEY_SNAPSHOT_MAX;
-		exact = false;
-	}
-
-	if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset))) {
-		in.snapshot	= KEY_SNAPSHOT_MAX;
-		exact = false;
-	}
-
-	if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot)))
-		exact = false;
-
-	pack_state_finish(&state, out);
-	out->u64s	= f->key_u64s;
-	out->format	= KEY_FORMAT_LOCAL_BTREE;
-	out->type	= KEY_TYPE_deleted;
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-	if (exact) {
-		BUG_ON(bkey_cmp_left_packed(b, out, &orig));
-	} else {
-		struct bkey_packed successor;
-
-		BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0);
-		BUG_ON(bkey_packed_successor(&successor, b, *out) &&
-		       bkey_cmp_left_packed(b, &successor, &orig) < 0 &&
-		       !bkey_format_has_too_big_fields(f));
-	}
-#endif
-
-	return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER;
-}
-
-void bch2_bkey_format_init(struct bkey_format_state *s)
-{
-	unsigned i;
-
-	for (i = 0; i < ARRAY_SIZE(s->field_min); i++)
-		s->field_min[i] = U64_MAX;
-
-	for (i = 0; i < ARRAY_SIZE(s->field_max); i++)
-		s->field_max[i] = 0;
-
-	/* Make sure we can store a size of 0: */
-	s->field_min[BKEY_FIELD_SIZE] = 0;
-}
-
-void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p)
-{
-	unsigned field = 0;
-
-	__bkey_format_add(s, field++, p.inode);
-	__bkey_format_add(s, field++, p.offset);
-	__bkey_format_add(s, field++, p.snapshot);
-}
-
-/*
- * We don't want it to be possible for the packed format to represent fields
- * bigger than a u64... that will cause confusion and issues (like with
- * bkey_packed_successor())
- */
-static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i,
-			     unsigned bits, u64 offset)
-{
-	unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
-	u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
-
-	bits = min(bits, unpacked_bits);
-
-	offset = bits == unpacked_bits ? 0 : min(offset, unpacked_max - ((1ULL << bits) - 1));
-
-	f->bits_per_field[i]	= bits;
-	f->field_offset[i]	= cpu_to_le64(offset);
-}
-
-struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s)
-{
-	unsigned i, bits = KEY_PACKED_BITS_START;
-	struct bkey_format ret = {
-		.nr_fields = BKEY_NR_FIELDS,
-	};
-
-	for (i = 0; i < ARRAY_SIZE(s->field_min); i++) {
-		s->field_min[i] = min(s->field_min[i], s->field_max[i]);
-
-		set_format_field(&ret, i,
-				 fls64(s->field_max[i] - s->field_min[i]),
-				 s->field_min[i]);
-
-		bits += ret.bits_per_field[i];
-	}
-
-	/* allow for extent merging: */
-	if (ret.bits_per_field[BKEY_FIELD_SIZE]) {
-		unsigned b = min(4U, 32U - ret.bits_per_field[BKEY_FIELD_SIZE]);
-
-		ret.bits_per_field[BKEY_FIELD_SIZE] += b;
-		bits += b;
-	}
-
-	ret.key_u64s = DIV_ROUND_UP(bits, 64);
-
-	/* if we have enough spare bits, round fields up to nearest byte */
-	bits = ret.key_u64s * 64 - bits;
-
-	for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) {
-		unsigned r = round_up(ret.bits_per_field[i], 8) -
-			ret.bits_per_field[i];
-
-		if (r <= bits) {
-			set_format_field(&ret, i,
-					 ret.bits_per_field[i] + r,
-					 le64_to_cpu(ret.field_offset[i]));
-			bits -= r;
-		}
-	}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-	{
-		struct printbuf buf = PRINTBUF;
-
-		BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf));
-		printbuf_exit(&buf);
-	}
-#endif
-	return ret;
-}
-
-int bch2_bkey_format_invalid(struct bch_fs *c,
-			     struct bkey_format *f,
-			     enum bch_validate_flags flags,
-			     struct printbuf *err)
-{
-	unsigned i, bits = KEY_PACKED_BITS_START;
-
-	if (f->nr_fields != BKEY_NR_FIELDS) {
-		prt_printf(err, "incorrect number of fields: got %u, should be %u",
-			   f->nr_fields, BKEY_NR_FIELDS);
-		return -BCH_ERR_invalid;
-	}
-
-	/*
-	 * Verify that the packed format can't represent fields larger than the
-	 * unpacked format:
-	 */
-	for (i = 0; i < f->nr_fields; i++) {
-		if ((!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) &&
-		    bch2_bkey_format_field_overflows(f, i)) {
-			unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
-			u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
-			u64 packed_max = f->bits_per_field[i]
-				? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
-				: 0;
-
-			prt_printf(err, "field %u too large: %llu + %llu > %llu",
-				   i, packed_max, le64_to_cpu(f->field_offset[i]), unpacked_max);
-			return -BCH_ERR_invalid;
-		}
-
-		bits += f->bits_per_field[i];
-	}
-
-	if (f->key_u64s != DIV_ROUND_UP(bits, 64)) {
-		prt_printf(err, "incorrect key_u64s: got %u, should be %u",
-			   f->key_u64s, DIV_ROUND_UP(bits, 64));
-		return -BCH_ERR_invalid;
-	}
-
-	return 0;
-}
-
-void bch2_bkey_format_to_text(struct printbuf *out, const struct bkey_format *f)
-{
-	prt_printf(out, "u64s %u fields ", f->key_u64s);
-
-	for (unsigned i = 0; i < ARRAY_SIZE(f->bits_per_field); i++) {
-		if (i)
-			prt_str(out, ", ");
-		prt_printf(out, "%u:%llu",
-			   f->bits_per_field[i],
-			   le64_to_cpu(f->field_offset[i]));
-	}
-}
-
-/*
- * Most significant differing bit
- * Bits are indexed from 0 - return is [0, nr_key_bits)
- */
-__pure
-unsigned bch2_bkey_greatest_differing_bit(const struct btree *b,
-					  const struct bkey_packed *l_k,
-					  const struct bkey_packed *r_k)
-{
-	const u64 *l = high_word(&b->format, l_k);
-	const u64 *r = high_word(&b->format, r_k);
-	unsigned nr_key_bits = b->nr_key_bits;
-	unsigned word_bits = 64 - high_bit_offset;
-	u64 l_v, r_v;
-
-	EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
-
-	/* for big endian, skip past header */
-	l_v = *l & (~0ULL >> high_bit_offset);
-	r_v = *r & (~0ULL >> high_bit_offset);
-
-	while (nr_key_bits) {
-		if (nr_key_bits < word_bits) {
-			l_v >>= word_bits - nr_key_bits;
-			r_v >>= word_bits - nr_key_bits;
-			nr_key_bits = 0;
-		} else {
-			nr_key_bits -= word_bits;
-		}
-
-		if (l_v != r_v)
-			return fls64(l_v ^ r_v) - 1 + nr_key_bits;
-
-		l = next_word(l);
-		r = next_word(r);
-
-		l_v = *l;
-		r_v = *r;
-		word_bits = 64;
-	}
-
-	return 0;
-}
-
-/*
- * First set bit
- * Bits are indexed from 0 - return is [0, nr_key_bits)
- */
-__pure
-unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k)
-{
-	const u64 *p = high_word(&b->format, k);
-	unsigned nr_key_bits = b->nr_key_bits;
-	unsigned ret = 0, offset;
-
-	EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
-
-	offset = nr_key_bits;
-	while (offset > 64) {
-		p = next_word(p);
-		offset -= 64;
-	}
-
-	offset = 64 - offset;
-
-	while (nr_key_bits) {
-		unsigned bits = nr_key_bits + offset < 64
-			? nr_key_bits
-			: 64 - offset;
-
-		u64 mask = (~0ULL >> (64 - bits)) << offset;
-
-		if (*p & mask)
-			return ret + __ffs64(*p & mask) - offset;
-
-		p = prev_word(p);
-		nr_key_bits -= bits;
-		ret += bits;
-		offset = 0;
-	}
-
-	return 0;
-}
-
-#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
-
-#define I(_x)			(*(out)++ = (_x))
-#define I1(i0)						I(i0)
-#define I2(i0, i1)		(I1(i0),		I(i1))
-#define I3(i0, i1, i2)		(I2(i0, i1),		I(i2))
-#define I4(i0, i1, i2, i3)	(I3(i0, i1, i2),	I(i3))
-#define I5(i0, i1, i2, i3, i4)	(I4(i0, i1, i2, i3),	I(i4))
-
-static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
-			      enum bch_bkey_fields field,
-			      unsigned dst_offset, unsigned dst_size,
-			      bool *eax_zeroed)
-{
-	unsigned bits = format->bits_per_field[field];
-	u64 offset = le64_to_cpu(format->field_offset[field]);
-	unsigned i, byte, bit_offset, align, shl, shr;
-
-	if (!bits && !offset) {
-		if (!*eax_zeroed) {
-			/* xor eax, eax */
-			I2(0x31, 0xc0);
-		}
-
-		*eax_zeroed = true;
-		goto set_field;
-	}
-
-	if (!bits) {
-		/* just return offset: */
-
-		switch (dst_size) {
-		case 8:
-			if (offset > S32_MAX) {
-				/* mov [rdi + dst_offset], offset */
-				I3(0xc7, 0x47, dst_offset);
-				memcpy(out, &offset, 4);
-				out += 4;
-
-				I3(0xc7, 0x47, dst_offset + 4);
-				memcpy(out, (void *) &offset + 4, 4);
-				out += 4;
-			} else {
-				/* mov [rdi + dst_offset], offset */
-				/* sign extended */
-				I4(0x48, 0xc7, 0x47, dst_offset);
-				memcpy(out, &offset, 4);
-				out += 4;
-			}
-			break;
-		case 4:
-			/* mov [rdi + dst_offset], offset */
-			I3(0xc7, 0x47, dst_offset);
-			memcpy(out, &offset, 4);
-			out += 4;
-			break;
-		default:
-			BUG();
-		}
-
-		return out;
-	}
-
-	bit_offset = format->key_u64s * 64;
-	for (i = 0; i <= field; i++)
-		bit_offset -= format->bits_per_field[i];
-
-	byte = bit_offset / 8;
-	bit_offset -= byte * 8;
-
-	*eax_zeroed = false;
-
-	if (bit_offset == 0 && bits == 8) {
-		/* movzx eax, BYTE PTR [rsi + imm8] */
-		I4(0x0f, 0xb6, 0x46, byte);
-	} else if (bit_offset == 0 && bits == 16) {
-		/* movzx eax, WORD PTR [rsi + imm8] */
-		I4(0x0f, 0xb7, 0x46, byte);
-	} else if (bit_offset + bits <= 32) {
-		align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
-		byte -= align;
-		bit_offset += align * 8;
-
-		BUG_ON(bit_offset + bits > 32);
-
-		/* mov eax, [rsi + imm8] */
-		I3(0x8b, 0x46, byte);
-
-		if (bit_offset) {
-			/* shr eax, imm8 */
-			I3(0xc1, 0xe8, bit_offset);
-		}
-
-		if (bit_offset + bits < 32) {
-			unsigned mask = ~0U >> (32 - bits);
-
-			/* and eax, imm32 */
-			I1(0x25);
-			memcpy(out, &mask, 4);
-			out += 4;
-		}
-	} else if (bit_offset + bits <= 64) {
-		align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7);
-		byte -= align;
-		bit_offset += align * 8;
-
-		BUG_ON(bit_offset + bits > 64);
-
-		/* mov rax, [rsi + imm8] */
-		I4(0x48, 0x8b, 0x46, byte);
-
-		shl = 64 - bit_offset - bits;
-		shr = bit_offset + shl;
-
-		if (shl) {
-			/* shl rax, imm8 */
-			I4(0x48, 0xc1, 0xe0, shl);
-		}
-
-		if (shr) {
-			/* shr rax, imm8 */
-			I4(0x48, 0xc1, 0xe8, shr);
-		}
-	} else {
-		align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
-		byte -= align;
-		bit_offset += align * 8;
-
-		BUG_ON(bit_offset + bits > 96);
-
-		/* mov rax, [rsi + byte] */
-		I4(0x48, 0x8b, 0x46, byte);
-
-		/* mov edx, [rsi + byte + 8] */
-		I3(0x8b, 0x56, byte + 8);
-
-		/* bits from next word: */
-		shr = bit_offset + bits - 64;
-		BUG_ON(shr > bit_offset);
-
-		/* shr rax, bit_offset */
-		I4(0x48, 0xc1, 0xe8, shr);
-
-		/* shl rdx, imm8 */
-		I4(0x48, 0xc1, 0xe2, 64 - shr);
-
-		/* or rax, rdx */
-		I3(0x48, 0x09, 0xd0);
-
-		shr = bit_offset - shr;
-
-		if (shr) {
-			/* shr rax, imm8 */
-			I4(0x48, 0xc1, 0xe8, shr);
-		}
-	}
-
-	/* rax += offset: */
-	if (offset > S32_MAX) {
-		/* mov rdx, imm64 */
-		I2(0x48, 0xba);
-		memcpy(out, &offset, 8);
-		out += 8;
-		/* add %rdx, %rax */
-		I3(0x48, 0x01, 0xd0);
-	} else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) {
-		/* add rax, imm32 */
-		I2(0x48, 0x05);
-		memcpy(out, &offset, 4);
-		out += 4;
-	} else if (offset) {
-		/* add eax, imm32 */
-		I1(0x05);
-		memcpy(out, &offset, 4);
-		out += 4;
-	}
-set_field:
-	switch (dst_size) {
-	case 8:
-		/* mov [rdi + dst_offset], rax */
-		I4(0x48, 0x89, 0x47, dst_offset);
-		break;
-	case 4:
-		/* mov [rdi + dst_offset], eax */
-		I3(0x89, 0x47, dst_offset);
-		break;
-	default:
-		BUG();
-	}
-
-	return out;
-}
-
-int bch2_compile_bkey_format(const struct bkey_format *format, void *_out)
-{
-	bool eax_zeroed = false;
-	u8 *out = _out;
-
-	/*
-	 * rdi: dst - unpacked key
-	 * rsi: src - packed key
-	 */
-
-	/* k->u64s, k->format, k->type */
-
-	/* mov eax, [rsi] */
-	I2(0x8b, 0x06);
-
-	/* add eax, BKEY_U64s - format->key_u64s */
-	I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0);
-
-	/* and eax, imm32: mask out k->pad: */
-	I5(0x25, 0xff, 0xff, 0xff, 0);
-
-	/* mov [rdi], eax */
-	I2(0x89, 0x07);
-
-#define x(id, field)							\
-	out = compile_bkey_field(format, out, id,			\
-				 offsetof(struct bkey, field),		\
-				 sizeof(((struct bkey *) NULL)->field),	\
-				 &eax_zeroed);
-	bkey_fields()
-#undef x
-
-	/* retq */
-	I1(0xc3);
-
-	return (void *) out - _out;
-}
-
-#else
-#endif
-
-__pure
-int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l,
-					  const struct bkey_packed *r,
-					  const struct btree *b)
-{
-	return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b);
-}
-
-__pure __flatten
-int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b,
-					       const struct bkey_packed *l,
-					       const struct bpos *r)
-{
-	return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r);
-}
-
-__pure __flatten
-int bch2_bkey_cmp_packed(const struct btree *b,
-			 const struct bkey_packed *l,
-			 const struct bkey_packed *r)
-{
-	return bch2_bkey_cmp_packed_inlined(b, l, r);
-}
-
-__pure __flatten
-int __bch2_bkey_cmp_left_packed(const struct btree *b,
-				const struct bkey_packed *l,
-				const struct bpos *r)
-{
-	const struct bkey *l_unpacked;
-
-	return unlikely(l_unpacked = packed_to_bkey_c(l))
-		? bpos_cmp(l_unpacked->p, *r)
-		: __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
-}
-
-void bch2_bpos_swab(struct bpos *p)
-{
-	u8 *l = (u8 *) p;
-	u8 *h = ((u8 *) &p[1]) - 1;
-
-	while (l < h) {
-		swap(*l, *h);
-		l++;
-		--h;
-	}
-}
-
-void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k)
-{
-	const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current;
-	u8 *l = k->key_start;
-	u8 *h = (u8 *) (k->_data + f->key_u64s) - 1;
-
-	while (l < h) {
-		swap(*l, *h);
-		l++;
-		--h;
-	}
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_bkey_pack_test(void)
-{
-	struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0);
-	struct bkey_packed p;
-
-	struct bkey_format test_format = {
-		.key_u64s	= 3,
-		.nr_fields	= BKEY_NR_FIELDS,
-		.bits_per_field = {
-			13,
-			64,
-			32,
-		},
-	};
-
-	struct unpack_state in_s =
-		unpack_state_init(&bch2_bkey_format_current, (void *) &t);
-	struct pack_state out_s = pack_state_init(&test_format, &p);
-	unsigned i;
-
-	for (i = 0; i < out_s.format->nr_fields; i++) {
-		u64 a, v = get_inc_field(&in_s, i);
-
-		switch (i) {
-#define x(id, field)	case id: a = t.field; break;
-	bkey_fields()
-#undef x
-		default:
-			BUG();
-		}
-
-		if (a != v)
-			panic("got %llu actual %llu i %u\n", v, a, i);
-
-		if (!set_inc_field(&out_s, i, v))
-			panic("failed at %u\n", i);
-	}
-
-	BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format));
-}
-#endif
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
deleted file mode 100644
index fcd43915df07..000000000000
--- a/fs/bcachefs/bkey.h
+++ /dev/null
@@ -1,604 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_H
-#define _BCACHEFS_BKEY_H
-
-#include <linux/bug.h>
-#include "bcachefs_format.h"
-#include "bkey_types.h"
-#include "btree_types.h"
-#include "util.h"
-#include "vstructs.h"
-
-enum bch_validate_flags {
-	BCH_VALIDATE_write		= (1U << 0),
-	BCH_VALIDATE_commit		= (1U << 1),
-	BCH_VALIDATE_journal		= (1U << 2),
-};
-
-#if 0
-
-/*
- * compiled unpack functions are disabled, pending a new interface for
- * dynamically allocating executable memory:
- */
-
-#ifdef CONFIG_X86_64
-#define HAVE_BCACHEFS_COMPILED_UNPACK	1
-#endif
-#endif
-
-void bch2_bkey_packed_to_binary_text(struct printbuf *,
-				     const struct bkey_format *,
-				     const struct bkey_packed *);
-
-enum bkey_lr_packed {
-	BKEY_PACKED_BOTH,
-	BKEY_PACKED_RIGHT,
-	BKEY_PACKED_LEFT,
-	BKEY_PACKED_NONE,
-};
-
-#define bkey_lr_packed(_l, _r)						\
-	((_l)->format + ((_r)->format << 1))
-
-static inline void bkey_p_copy(struct bkey_packed *dst, const struct bkey_packed *src)
-{
-	memcpy_u64s_small(dst, src, src->u64s);
-}
-
-static inline void bkey_copy(struct bkey_i *dst, const struct bkey_i *src)
-{
-	memcpy_u64s_small(dst, src, src->k.u64s);
-}
-
-struct btree;
-
-__pure
-unsigned bch2_bkey_greatest_differing_bit(const struct btree *,
-					  const struct bkey_packed *,
-					  const struct bkey_packed *);
-__pure
-unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *);
-
-__pure
-int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *,
-				     const struct bkey_packed *,
-				     const struct btree *);
-
-__pure
-int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *,
-					  const struct bkey_packed *,
-					  const struct bpos *);
-
-__pure
-int bch2_bkey_cmp_packed(const struct btree *,
-			 const struct bkey_packed *,
-			 const struct bkey_packed *);
-
-__pure
-int __bch2_bkey_cmp_left_packed(const struct btree *,
-				const struct bkey_packed *,
-				const struct bpos *);
-
-static inline __pure
-int bkey_cmp_left_packed(const struct btree *b,
-			 const struct bkey_packed *l, const struct bpos *r)
-{
-	return __bch2_bkey_cmp_left_packed(b, l, r);
-}
-
-/*
- * The compiler generates better code when we pass bpos by ref, but it's often
- * enough terribly convenient to pass it by val... as much as I hate c++, const
- * ref would be nice here:
- */
-__pure __flatten
-static inline int bkey_cmp_left_packed_byval(const struct btree *b,
-					     const struct bkey_packed *l,
-					     struct bpos r)
-{
-	return bkey_cmp_left_packed(b, l, &r);
-}
-
-static __always_inline bool bpos_eq(struct bpos l, struct bpos r)
-{
-	return  !((l.inode	^ r.inode) |
-		  (l.offset	^ r.offset) |
-		  (l.snapshot	^ r.snapshot));
-}
-
-static __always_inline bool bpos_lt(struct bpos l, struct bpos r)
-{
-	return  l.inode	!= r.inode ? l.inode < r.inode :
-		l.offset != r.offset ? l.offset < r.offset :
-		l.snapshot != r.snapshot ? l.snapshot < r.snapshot : false;
-}
-
-static __always_inline bool bpos_le(struct bpos l, struct bpos r)
-{
-	return  l.inode	!= r.inode ? l.inode < r.inode :
-		l.offset != r.offset ? l.offset < r.offset :
-		l.snapshot != r.snapshot ? l.snapshot < r.snapshot : true;
-}
-
-static __always_inline bool bpos_gt(struct bpos l, struct bpos r)
-{
-	return bpos_lt(r, l);
-}
-
-static __always_inline bool bpos_ge(struct bpos l, struct bpos r)
-{
-	return bpos_le(r, l);
-}
-
-static __always_inline int bpos_cmp(struct bpos l, struct bpos r)
-{
-	return  cmp_int(l.inode,    r.inode) ?:
-		cmp_int(l.offset,   r.offset) ?:
-		cmp_int(l.snapshot, r.snapshot);
-}
-
-static inline struct bpos bpos_min(struct bpos l, struct bpos r)
-{
-	return bpos_lt(l, r) ? l : r;
-}
-
-static inline struct bpos bpos_max(struct bpos l, struct bpos r)
-{
-	return bpos_gt(l, r) ? l : r;
-}
-
-static __always_inline bool bkey_eq(struct bpos l, struct bpos r)
-{
-	return  !((l.inode	^ r.inode) |
-		  (l.offset	^ r.offset));
-}
-
-static __always_inline bool bkey_lt(struct bpos l, struct bpos r)
-{
-	return  l.inode	!= r.inode
-		? l.inode < r.inode
-		: l.offset < r.offset;
-}
-
-static __always_inline bool bkey_le(struct bpos l, struct bpos r)
-{
-	return  l.inode	!= r.inode
-		? l.inode < r.inode
-		: l.offset <= r.offset;
-}
-
-static __always_inline bool bkey_gt(struct bpos l, struct bpos r)
-{
-	return bkey_lt(r, l);
-}
-
-static __always_inline bool bkey_ge(struct bpos l, struct bpos r)
-{
-	return bkey_le(r, l);
-}
-
-static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
-{
-	return  cmp_int(l.inode,    r.inode) ?:
-		cmp_int(l.offset,   r.offset);
-}
-
-static inline struct bpos bkey_min(struct bpos l, struct bpos r)
-{
-	return bkey_lt(l, r) ? l : r;
-}
-
-static inline struct bpos bkey_max(struct bpos l, struct bpos r)
-{
-	return bkey_gt(l, r) ? l : r;
-}
-
-void bch2_bpos_swab(struct bpos *);
-void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
-
-static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
-{
-	return  cmp_int(l.hi, r.hi) ?:
-		cmp_int(l.lo, r.lo);
-}
-
-#define ZERO_VERSION	((struct bversion) { .hi = 0, .lo = 0 })
-#define MAX_VERSION	((struct bversion) { .hi = ~0, .lo = ~0ULL })
-
-static __always_inline int bversion_zero(struct bversion v)
-{
-	return !bversion_cmp(v, ZERO_VERSION);
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-/* statement expressions confusing unlikely()? */
-#define bkey_packed(_k)							\
-	({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT);			\
-	 (_k)->format != KEY_FORMAT_CURRENT; })
-#else
-#define bkey_packed(_k)		((_k)->format != KEY_FORMAT_CURRENT)
-#endif
-
-/*
- * It's safe to treat an unpacked bkey as a packed one, but not the reverse
- */
-static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k)
-{
-	return (struct bkey_packed *) k;
-}
-
-static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k)
-{
-	return (const struct bkey_packed *) k;
-}
-
-static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k)
-{
-	return bkey_packed(k) ? NULL : (struct bkey_i *) k;
-}
-
-static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k)
-{
-	return bkey_packed(k) ? NULL : (const struct bkey *) k;
-}
-
-static inline unsigned bkey_format_key_bits(const struct bkey_format *format)
-{
-	return format->bits_per_field[BKEY_FIELD_INODE] +
-		format->bits_per_field[BKEY_FIELD_OFFSET] +
-		format->bits_per_field[BKEY_FIELD_SNAPSHOT];
-}
-
-static inline struct bpos bpos_successor(struct bpos p)
-{
-	if (!++p.snapshot &&
-	    !++p.offset &&
-	    !++p.inode)
-		BUG();
-
-	return p;
-}
-
-static inline struct bpos bpos_predecessor(struct bpos p)
-{
-	if (!p.snapshot-- &&
-	    !p.offset-- &&
-	    !p.inode--)
-		BUG();
-
-	return p;
-}
-
-static inline struct bpos bpos_nosnap_successor(struct bpos p)
-{
-	p.snapshot = 0;
-
-	if (!++p.offset &&
-	    !++p.inode)
-		BUG();
-
-	return p;
-}
-
-static inline struct bpos bpos_nosnap_predecessor(struct bpos p)
-{
-	p.snapshot = 0;
-
-	if (!p.offset-- &&
-	    !p.inode--)
-		BUG();
-
-	return p;
-}
-
-static inline u64 bkey_start_offset(const struct bkey *k)
-{
-	return k->p.offset - k->size;
-}
-
-static inline struct bpos bkey_start_pos(const struct bkey *k)
-{
-	return (struct bpos) {
-		.inode		= k->p.inode,
-		.offset		= bkey_start_offset(k),
-		.snapshot	= k->p.snapshot,
-	};
-}
-
-/* Packed helpers */
-
-static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
-				      const struct bkey_packed *k)
-{
-	return bkey_packed(k) ? format->key_u64s : BKEY_U64s;
-}
-
-static inline bool bkeyp_u64s_valid(const struct bkey_format *f,
-				    const struct bkey_packed *k)
-{
-	return ((unsigned) k->u64s - bkeyp_key_u64s(f, k) <= U8_MAX - BKEY_U64s);
-}
-
-static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
-				       const struct bkey_packed *k)
-{
-	return bkeyp_key_u64s(format, k) * sizeof(u64);
-}
-
-static inline unsigned bkeyp_val_u64s(const struct bkey_format *format,
-				      const struct bkey_packed *k)
-{
-	return k->u64s - bkeyp_key_u64s(format, k);
-}
-
-static inline size_t bkeyp_val_bytes(const struct bkey_format *format,
-				     const struct bkey_packed *k)
-{
-	return bkeyp_val_u64s(format, k) * sizeof(u64);
-}
-
-static inline void set_bkeyp_val_u64s(const struct bkey_format *format,
-				      struct bkey_packed *k, unsigned val_u64s)
-{
-	k->u64s = bkeyp_key_u64s(format, k) + val_u64s;
-}
-
-#define bkeyp_val(_format, _k)						\
-	 ((struct bch_val *) ((u64 *) (_k)->_data + bkeyp_key_u64s(_format, _k)))
-
-extern const struct bkey_format bch2_bkey_format_current;
-
-bool bch2_bkey_transform(const struct bkey_format *,
-			 struct bkey_packed *,
-			 const struct bkey_format *,
-			 const struct bkey_packed *);
-
-struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,
-				   const struct bkey_packed *);
-
-#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
-struct bpos __bkey_unpack_pos(const struct bkey_format *,
-			      const struct bkey_packed *);
-#endif
-
-bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *,
-		   const struct bkey_format *);
-
-enum bkey_pack_pos_ret {
-	BKEY_PACK_POS_EXACT,
-	BKEY_PACK_POS_SMALLER,
-	BKEY_PACK_POS_FAIL,
-};
-
-enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos,
-					   const struct btree *);
-
-static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in,
-				 const struct btree *b)
-{
-	return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT;
-}
-
-void bch2_bkey_unpack(const struct btree *, struct bkey_i *,
-		 const struct bkey_packed *);
-bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *,
-	       const struct bkey_format *);
-
-typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
-
-static inline void
-__bkey_unpack_key_format_checked(const struct btree *b,
-			       struct bkey *dst,
-			       const struct bkey_packed *src)
-{
-	if (IS_ENABLED(HAVE_BCACHEFS_COMPILED_UNPACK)) {
-		compiled_unpack_fn unpack_fn = b->aux_data;
-		unpack_fn(dst, src);
-
-		if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
-		    bch2_expensive_debug_checks) {
-			struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
-
-			BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
-		}
-	} else {
-		*dst = __bch2_bkey_unpack_key(&b->format, src);
-	}
-}
-
-static inline struct bkey
-bkey_unpack_key_format_checked(const struct btree *b,
-			       const struct bkey_packed *src)
-{
-	struct bkey dst;
-
-	__bkey_unpack_key_format_checked(b, &dst, src);
-	return dst;
-}
-
-static inline void __bkey_unpack_key(const struct btree *b,
-				     struct bkey *dst,
-				     const struct bkey_packed *src)
-{
-	if (likely(bkey_packed(src)))
-		__bkey_unpack_key_format_checked(b, dst, src);
-	else
-		*dst = *packed_to_bkey_c(src);
-}
-
-/**
- * bkey_unpack_key -- unpack just the key, not the value
- */
-static inline struct bkey bkey_unpack_key(const struct btree *b,
-					  const struct bkey_packed *src)
-{
-	return likely(bkey_packed(src))
-		? bkey_unpack_key_format_checked(b, src)
-		: *packed_to_bkey_c(src);
-}
-
-static inline struct bpos
-bkey_unpack_pos_format_checked(const struct btree *b,
-			       const struct bkey_packed *src)
-{
-#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
-	return bkey_unpack_key_format_checked(b, src).p;
-#else
-	return __bkey_unpack_pos(&b->format, src);
-#endif
-}
-
-static inline struct bpos bkey_unpack_pos(const struct btree *b,
-					  const struct bkey_packed *src)
-{
-	return likely(bkey_packed(src))
-		? bkey_unpack_pos_format_checked(b, src)
-		: packed_to_bkey_c(src)->p;
-}
-
-/* Disassembled bkeys */
-
-static inline struct bkey_s_c bkey_disassemble(const struct btree *b,
-					       const struct bkey_packed *k,
-					       struct bkey *u)
-{
-	__bkey_unpack_key(b, u, k);
-
-	return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), };
-}
-
-/* non const version: */
-static inline struct bkey_s __bkey_disassemble(const struct btree *b,
-					       struct bkey_packed *k,
-					       struct bkey *u)
-{
-	__bkey_unpack_key(b, u, k);
-
-	return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
-}
-
-static inline u64 bkey_field_max(const struct bkey_format *f,
-				 enum bch_bkey_fields nr)
-{
-	return f->bits_per_field[nr] < 64
-		? (le64_to_cpu(f->field_offset[nr]) +
-		   ~(~0ULL << f->bits_per_field[nr]))
-		: U64_MAX;
-}
-
-#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
-
-int bch2_compile_bkey_format(const struct bkey_format *, void *);
-
-#else
-
-static inline int bch2_compile_bkey_format(const struct bkey_format *format,
-					  void *out) { return 0; }
-
-#endif
-
-static inline void bkey_reassemble(struct bkey_i *dst,
-				   struct bkey_s_c src)
-{
-	dst->k = *src.k;
-	memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k));
-}
-
-/* byte order helpers */
-
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-
-static inline unsigned high_word_offset(const struct bkey_format *f)
-{
-	return f->key_u64s - 1;
-}
-
-#define high_bit_offset		0
-#define nth_word(p, n)		((p) - (n))
-
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-
-static inline unsigned high_word_offset(const struct bkey_format *f)
-{
-	return 0;
-}
-
-#define high_bit_offset		KEY_PACKED_BITS_START
-#define nth_word(p, n)		((p) + (n))
-
-#else
-#error edit for your odd byteorder.
-#endif
-
-#define high_word(f, k)		((u64 *) (k)->_data + high_word_offset(f))
-#define next_word(p)		nth_word(p, 1)
-#define prev_word(p)		nth_word(p, -1)
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_bkey_pack_test(void);
-#else
-static inline void bch2_bkey_pack_test(void) {}
-#endif
-
-#define bkey_fields()							\
-	x(BKEY_FIELD_INODE,		p.inode)			\
-	x(BKEY_FIELD_OFFSET,		p.offset)			\
-	x(BKEY_FIELD_SNAPSHOT,		p.snapshot)			\
-	x(BKEY_FIELD_SIZE,		size)				\
-	x(BKEY_FIELD_VERSION_HI,	version.hi)			\
-	x(BKEY_FIELD_VERSION_LO,	version.lo)
-
-struct bkey_format_state {
-	u64 field_min[BKEY_NR_FIELDS];
-	u64 field_max[BKEY_NR_FIELDS];
-};
-
-void bch2_bkey_format_init(struct bkey_format_state *);
-
-static inline void __bkey_format_add(struct bkey_format_state *s, unsigned field, u64 v)
-{
-	s->field_min[field] = min(s->field_min[field], v);
-	s->field_max[field] = max(s->field_max[field], v);
-}
-
-/*
- * Changes @format so that @k can be successfully packed with @format
- */
-static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k)
-{
-#define x(id, field) __bkey_format_add(s, id, k->field);
-	bkey_fields()
-#undef x
-}
-
-void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
-struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
-
-static inline bool bch2_bkey_format_field_overflows(struct bkey_format *f, unsigned i)
-{
-	unsigned f_bits = f->bits_per_field[i];
-	unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
-	u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
-	u64 field_offset = le64_to_cpu(f->field_offset[i]);
-
-	if (f_bits > unpacked_bits)
-		return true;
-
-	if ((f_bits == unpacked_bits) && field_offset)
-		return true;
-
-	u64 f_mask = f_bits
-		? ~((~0ULL << (f_bits - 1)) << 1)
-		: 0;
-
-	if (((field_offset + f_mask) & unpacked_mask) < field_offset)
-		return true;
-	return false;
-}
-
-int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *,
-			     enum bch_validate_flags, struct printbuf *);
-void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *);
-
-#endif /* _BCACHEFS_BKEY_H */
diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h
deleted file mode 100644
index a30c4ae8eb36..000000000000
--- a/fs/bcachefs/bkey_buf.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_BUF_H
-#define _BCACHEFS_BKEY_BUF_H
-
-#include "bcachefs.h"
-#include "bkey.h"
-
-struct bkey_buf {
-	struct bkey_i	*k;
-	u64		onstack[12];
-};
-
-static inline void bch2_bkey_buf_realloc(struct bkey_buf *s,
-					 struct bch_fs *c, unsigned u64s)
-{
-	if (s->k == (void *) s->onstack &&
-	    u64s > ARRAY_SIZE(s->onstack)) {
-		s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS);
-		memcpy(s->k, s->onstack, sizeof(s->onstack));
-	}
-}
-
-static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s,
-					    struct bch_fs *c,
-					    struct bkey_s_c k)
-{
-	bch2_bkey_buf_realloc(s, c, k.k->u64s);
-	bkey_reassemble(s->k, k);
-}
-
-static inline void bch2_bkey_buf_copy(struct bkey_buf *s,
-				      struct bch_fs *c,
-				      struct bkey_i *src)
-{
-	bch2_bkey_buf_realloc(s, c, src->k.u64s);
-	bkey_copy(s->k, src);
-}
-
-static inline void bch2_bkey_buf_unpack(struct bkey_buf *s,
-					struct bch_fs *c,
-					struct btree *b,
-					struct bkey_packed *src)
-{
-	bch2_bkey_buf_realloc(s, c, BKEY_U64s +
-			      bkeyp_val_u64s(&b->format, src));
-	bch2_bkey_unpack(b, s->k, src);
-}
-
-static inline void bch2_bkey_buf_init(struct bkey_buf *s)
-{
-	s->k = (void *) s->onstack;
-}
-
-static inline void bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c)
-{
-	if (s->k != (void *) s->onstack)
-		mempool_free(s->k, &c->large_bkey_pool);
-	s->k = NULL;
-}
-
-#endif /* _BCACHEFS_BKEY_BUF_H */
diff --git a/fs/bcachefs/bkey_cmp.h b/fs/bcachefs/bkey_cmp.h
deleted file mode 100644
index 5f42a6e69360..000000000000
--- a/fs/bcachefs/bkey_cmp.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_CMP_H
-#define _BCACHEFS_BKEY_CMP_H
-
-#include "bkey.h"
-
-#ifdef CONFIG_X86_64
-static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
-				  unsigned nr_key_bits)
-{
-	long d0, d1, d2, d3;
-	int cmp;
-
-	/* we shouldn't need asm for this, but gcc is being retarded: */
-
-	asm(".intel_syntax noprefix;"
-	    "xor eax, eax;"
-	    "xor edx, edx;"
-	    "1:;"
-	    "mov r8, [rdi];"
-	    "mov r9, [rsi];"
-	    "sub ecx, 64;"
-	    "jl 2f;"
-
-	    "cmp r8, r9;"
-	    "jnz 3f;"
-
-	    "lea rdi, [rdi - 8];"
-	    "lea rsi, [rsi - 8];"
-	    "jmp 1b;"
-
-	    "2:;"
-	    "not ecx;"
-	    "shr r8, 1;"
-	    "shr r9, 1;"
-	    "shr r8, cl;"
-	    "shr r9, cl;"
-	    "cmp r8, r9;"
-
-	    "3:\n"
-	    "seta al;"
-	    "setb dl;"
-	    "sub eax, edx;"
-	    ".att_syntax prefix;"
-	    : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp)
-	    : "0" (l), "1" (r), "3" (nr_key_bits)
-	    : "r8", "r9", "cc", "memory");
-
-	return cmp;
-}
-#else
-static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
-				  unsigned nr_key_bits)
-{
-	u64 l_v, r_v;
-
-	if (!nr_key_bits)
-		return 0;
-
-	/* for big endian, skip past header */
-	nr_key_bits += high_bit_offset;
-	l_v = *l & (~0ULL >> high_bit_offset);
-	r_v = *r & (~0ULL >> high_bit_offset);
-
-	while (1) {
-		if (nr_key_bits < 64) {
-			l_v >>= 64 - nr_key_bits;
-			r_v >>= 64 - nr_key_bits;
-			nr_key_bits = 0;
-		} else {
-			nr_key_bits -= 64;
-		}
-
-		if (!nr_key_bits || l_v != r_v)
-			break;
-
-		l = next_word(l);
-		r = next_word(r);
-
-		l_v = *l;
-		r_v = *r;
-	}
-
-	return cmp_int(l_v, r_v);
-}
-#endif
-
-static inline __pure __flatten
-int __bch2_bkey_cmp_packed_format_checked_inlined(const struct bkey_packed *l,
-					  const struct bkey_packed *r,
-					  const struct btree *b)
-{
-	const struct bkey_format *f = &b->format;
-	int ret;
-
-	EBUG_ON(!bkey_packed(l) || !bkey_packed(r));
-	EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
-
-	ret = __bkey_cmp_bits(high_word(f, l),
-			      high_word(f, r),
-			      b->nr_key_bits);
-
-	EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l),
-				bkey_unpack_pos(b, r)));
-	return ret;
-}
-
-static inline __pure __flatten
-int bch2_bkey_cmp_packed_inlined(const struct btree *b,
-			 const struct bkey_packed *l,
-			 const struct bkey_packed *r)
-{
-	struct bkey unpacked;
-
-	if (likely(bkey_packed(l) && bkey_packed(r)))
-		return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b);
-
-	if (bkey_packed(l)) {
-		__bkey_unpack_key_format_checked(b, &unpacked, l);
-		l = (void *) &unpacked;
-	} else if (bkey_packed(r)) {
-		__bkey_unpack_key_format_checked(b, &unpacked, r);
-		r = (void *) &unpacked;
-	}
-
-	return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
-}
-
-#endif /* _BCACHEFS_BKEY_CMP_H */
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
deleted file mode 100644
index c2c3dae52186..000000000000
--- a/fs/bcachefs/bkey_methods.c
+++ /dev/null
@@ -1,478 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "backpointers.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-#include "btree_types.h"
-#include "alloc_background.h"
-#include "dirent.h"
-#include "ec.h"
-#include "error.h"
-#include "extents.h"
-#include "inode.h"
-#include "io_misc.h"
-#include "lru.h"
-#include "quota.h"
-#include "reflink.h"
-#include "snapshot.h"
-#include "subvolume.h"
-#include "xattr.h"
-
-const char * const bch2_bkey_types[] = {
-#define x(name, nr) #name,
-	BCH_BKEY_TYPES()
-#undef x
-	NULL
-};
-
-static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k,
-			       enum bch_validate_flags flags, struct printbuf *err)
-{
-	return 0;
-}
-
-#define bch2_bkey_ops_deleted ((struct bkey_ops) {	\
-	.key_invalid = deleted_key_invalid,		\
-})
-
-#define bch2_bkey_ops_whiteout ((struct bkey_ops) {	\
-	.key_invalid = deleted_key_invalid,		\
-})
-
-static int empty_val_key_invalid(struct bch_fs *c, struct bkey_s_c k,
-				 enum bch_validate_flags flags, struct printbuf *err)
-{
-	int ret = 0;
-
-	bkey_fsck_err_on(bkey_val_bytes(k.k), c, err,
-			 bkey_val_size_nonzero,
-			 "incorrect value size (%zu != 0)",
-			 bkey_val_bytes(k.k));
-fsck_err:
-	return ret;
-}
-
-#define bch2_bkey_ops_error ((struct bkey_ops) {	\
-	.key_invalid = empty_val_key_invalid,		\
-})
-
-static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k,
-				   enum bch_validate_flags flags, struct printbuf *err)
-{
-	return 0;
-}
-
-static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c,
-				    struct bkey_s_c k)
-{
-	struct bkey_s_c_cookie ck = bkey_s_c_to_cookie(k);
-
-	prt_printf(out, "%llu", le64_to_cpu(ck.v->cookie));
-}
-
-#define bch2_bkey_ops_cookie ((struct bkey_ops) {	\
-	.key_invalid	= key_type_cookie_invalid,	\
-	.val_to_text	= key_type_cookie_to_text,	\
-	.min_val_size	= 8,				\
-})
-
-#define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\
-	.key_invalid = empty_val_key_invalid,		\
-})
-
-static int key_type_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k,
-					enum bch_validate_flags flags, struct printbuf *err)
-{
-	return 0;
-}
-
-static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
-					 struct bkey_s_c k)
-{
-	struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
-	unsigned datalen = bkey_inline_data_bytes(k.k);
-
-	prt_printf(out, "datalen %u: %*phN",
-	       datalen, min(datalen, 32U), d.v->data);
-}
-
-#define bch2_bkey_ops_inline_data ((struct bkey_ops) {	\
-	.key_invalid	= key_type_inline_data_invalid,	\
-	.val_to_text	= key_type_inline_data_to_text,	\
-})
-
-static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
-{
-	bch2_key_resize(l.k, l.k->size + r.k->size);
-	return true;
-}
-
-#define bch2_bkey_ops_set ((struct bkey_ops) {		\
-	.key_invalid	= empty_val_key_invalid,	\
-	.key_merge	= key_type_set_merge,		\
-})
-
-const struct bkey_ops bch2_bkey_ops[] = {
-#define x(name, nr) [KEY_TYPE_##name]	= bch2_bkey_ops_##name,
-	BCH_BKEY_TYPES()
-#undef x
-};
-
-const struct bkey_ops bch2_bkey_null_ops = {
-};
-
-int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k,
-			  enum bch_validate_flags flags,
-			  struct printbuf *err)
-{
-	if (test_bit(BCH_FS_no_invalid_checks, &c->flags))
-		return 0;
-
-	const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
-	int ret = 0;
-
-	bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, c, err,
-			 bkey_val_size_too_small,
-			 "bad val size (%zu < %u)",
-			 bkey_val_bytes(k.k), ops->min_val_size);
-
-	if (!ops->key_invalid)
-		return 0;
-
-	ret = ops->key_invalid(c, k, flags, err);
-fsck_err:
-	return ret;
-}
-
-static u64 bch2_key_types_allowed[] = {
-	[BKEY_TYPE_btree] =
-		BIT_ULL(KEY_TYPE_deleted)|
-		BIT_ULL(KEY_TYPE_btree_ptr)|
-		BIT_ULL(KEY_TYPE_btree_ptr_v2),
-#define x(name, nr, flags, keys)	[BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys,
-	BCH_BTREE_IDS()
-#undef x
-};
-
-const char *bch2_btree_node_type_str(enum btree_node_type type)
-{
-	return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1);
-}
-
-int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
-			enum btree_node_type type,
-			enum bch_validate_flags flags,
-			struct printbuf *err)
-{
-	if (test_bit(BCH_FS_no_invalid_checks, &c->flags))
-		return 0;
-
-	int ret = 0;
-
-	bkey_fsck_err_on(k.k->u64s < BKEY_U64s, c, err,
-			 bkey_u64s_too_small,
-			 "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s);
-
-	if (type >= BKEY_TYPE_NR)
-		return 0;
-
-	bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX &&
-			 (type == BKEY_TYPE_btree || (flags & BCH_VALIDATE_commit)) &&
-			 !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err,
-			 bkey_invalid_type_for_btree,
-			 "invalid key type for btree %s (%s)",
-			 bch2_btree_node_type_str(type),
-			 k.k->type < KEY_TYPE_MAX
-			 ? bch2_bkey_types[k.k->type]
-			 : "(unknown)");
-
-	if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
-		bkey_fsck_err_on(k.k->size == 0, c, err,
-				 bkey_extent_size_zero,
-				 "size == 0");
-
-		bkey_fsck_err_on(k.k->size > k.k->p.offset, c, err,
-				 bkey_extent_size_greater_than_offset,
-				 "size greater than offset (%u > %llu)",
-				 k.k->size, k.k->p.offset);
-	} else {
-		bkey_fsck_err_on(k.k->size, c, err,
-				 bkey_size_nonzero,
-				 "size != 0");
-	}
-
-	if (type != BKEY_TYPE_btree) {
-		enum btree_id btree = type - 1;
-
-		if (btree_type_has_snapshots(btree)) {
-			bkey_fsck_err_on(!k.k->p.snapshot, c, err,
-					 bkey_snapshot_zero,
-					 "snapshot == 0");
-		} else if (!btree_type_has_snapshot_field(btree)) {
-			bkey_fsck_err_on(k.k->p.snapshot, c, err,
-					 bkey_snapshot_nonzero,
-					 "nonzero snapshot");
-		} else {
-			/*
-			 * btree uses snapshot field but it's not required to be
-			 * nonzero
-			 */
-		}
-
-		bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), c, err,
-				 bkey_at_pos_max,
-				 "key at POS_MAX");
-	}
-fsck_err:
-	return ret;
-}
-
-int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
-		      enum btree_node_type type,
-		      enum bch_validate_flags flags,
-		      struct printbuf *err)
-{
-	return __bch2_bkey_invalid(c, k, type, flags, err) ?:
-		bch2_bkey_val_invalid(c, k, flags, err);
-}
-
-int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b,
-			    struct bkey_s_c k, struct printbuf *err)
-{
-	int ret = 0;
-
-	bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), c, err,
-			 bkey_before_start_of_btree_node,
-			 "key before start of btree node");
-
-	bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), c, err,
-			 bkey_after_end_of_btree_node,
-			 "key past end of btree node");
-fsck_err:
-	return ret;
-}
-
-void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
-{
-	if (bpos_eq(pos, POS_MIN))
-		prt_printf(out, "POS_MIN");
-	else if (bpos_eq(pos, POS_MAX))
-		prt_printf(out, "POS_MAX");
-	else if (bpos_eq(pos, SPOS_MAX))
-		prt_printf(out, "SPOS_MAX");
-	else {
-		if (pos.inode == U64_MAX)
-			prt_printf(out, "U64_MAX");
-		else
-			prt_printf(out, "%llu", pos.inode);
-		prt_printf(out, ":");
-		if (pos.offset == U64_MAX)
-			prt_printf(out, "U64_MAX");
-		else
-			prt_printf(out, "%llu", pos.offset);
-		prt_printf(out, ":");
-		if (pos.snapshot == U32_MAX)
-			prt_printf(out, "U32_MAX");
-		else
-			prt_printf(out, "%u", pos.snapshot);
-	}
-}
-
-void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
-{
-	if (k) {
-		prt_printf(out, "u64s %u type ", k->u64s);
-
-		if (k->type < KEY_TYPE_MAX)
-			prt_printf(out, "%s ", bch2_bkey_types[k->type]);
-		else
-			prt_printf(out, "%u ", k->type);
-
-		bch2_bpos_to_text(out, k->p);
-
-		prt_printf(out, " len %u ver %llu", k->size, k->version.lo);
-	} else {
-		prt_printf(out, "(null)");
-	}
-}
-
-void bch2_val_to_text(struct printbuf *out, struct bch_fs *c,
-		      struct bkey_s_c k)
-{
-	const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
-
-	if (likely(ops->val_to_text))
-		ops->val_to_text(out, c, k);
-}
-
-void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c,
-			   struct bkey_s_c k)
-{
-	bch2_bkey_to_text(out, k.k);
-
-	if (bkey_val_bytes(k.k)) {
-		prt_printf(out, ": ");
-		bch2_val_to_text(out, c, k);
-	}
-}
-
-void bch2_bkey_swab_val(struct bkey_s k)
-{
-	const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
-
-	if (ops->swab)
-		ops->swab(k);
-}
-
-bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k)
-{
-	const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
-
-	return ops->key_normalize
-		? ops->key_normalize(c, k)
-		: false;
-}
-
-bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
-{
-	const struct bkey_ops *ops = bch2_bkey_type_ops(l.k->type);
-
-	return ops->key_merge &&
-		bch2_bkey_maybe_mergable(l.k, r.k) &&
-		(u64) l.k->size + r.k->size <= KEY_SIZE_MAX &&
-		!bch2_key_merging_disabled &&
-		ops->key_merge(c, l, r);
-}
-
-static const struct old_bkey_type {
-	u8		btree_node_type;
-	u8		old;
-	u8		new;
-} bkey_renumber_table[] = {
-	{BKEY_TYPE_btree,	128, KEY_TYPE_btree_ptr		},
-	{BKEY_TYPE_extents,	128, KEY_TYPE_extent		},
-	{BKEY_TYPE_extents,	129, KEY_TYPE_extent		},
-	{BKEY_TYPE_extents,	130, KEY_TYPE_reservation	},
-	{BKEY_TYPE_inodes,	128, KEY_TYPE_inode		},
-	{BKEY_TYPE_inodes,	130, KEY_TYPE_inode_generation	},
-	{BKEY_TYPE_dirents,	128, KEY_TYPE_dirent		},
-	{BKEY_TYPE_dirents,	129, KEY_TYPE_hash_whiteout	},
-	{BKEY_TYPE_xattrs,	128, KEY_TYPE_xattr		},
-	{BKEY_TYPE_xattrs,	129, KEY_TYPE_hash_whiteout	},
-	{BKEY_TYPE_alloc,	128, KEY_TYPE_alloc		},
-	{BKEY_TYPE_quotas,	128, KEY_TYPE_quota		},
-};
-
-void bch2_bkey_renumber(enum btree_node_type btree_node_type,
-			struct bkey_packed *k,
-			int write)
-{
-	const struct old_bkey_type *i;
-
-	for (i = bkey_renumber_table;
-	     i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table);
-	     i++)
-		if (btree_node_type == i->btree_node_type &&
-		    k->type == (write ? i->new : i->old)) {
-			k->type = write ? i->old : i->new;
-			break;
-		}
-}
-
-void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
-			unsigned version, unsigned big_endian,
-			int write,
-			struct bkey_format *f,
-			struct bkey_packed *k)
-{
-	const struct bkey_ops *ops;
-	struct bkey uk;
-	unsigned nr_compat = 5;
-	int i;
-
-	/*
-	 * Do these operations in reverse order in the write path:
-	 */
-
-	for (i = 0; i < nr_compat; i++)
-	switch (!write ? i : nr_compat - 1 - i) {
-	case 0:
-		if (big_endian != CPU_BIG_ENDIAN)
-			bch2_bkey_swab_key(f, k);
-		break;
-	case 1:
-		if (version < bcachefs_metadata_version_bkey_renumber)
-			bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write);
-		break;
-	case 2:
-		if (version < bcachefs_metadata_version_inode_btree_change &&
-		    btree_id == BTREE_ID_inodes) {
-			if (!bkey_packed(k)) {
-				struct bkey_i *u = packed_to_bkey(k);
-
-				swap(u->k.p.inode, u->k.p.offset);
-			} else if (f->bits_per_field[BKEY_FIELD_INODE] &&
-				   f->bits_per_field[BKEY_FIELD_OFFSET]) {
-				struct bkey_format tmp = *f, *in = f, *out = &tmp;
-
-				swap(tmp.bits_per_field[BKEY_FIELD_INODE],
-				     tmp.bits_per_field[BKEY_FIELD_OFFSET]);
-				swap(tmp.field_offset[BKEY_FIELD_INODE],
-				     tmp.field_offset[BKEY_FIELD_OFFSET]);
-
-				if (!write)
-					swap(in, out);
-
-				uk = __bch2_bkey_unpack_key(in, k);
-				swap(uk.p.inode, uk.p.offset);
-				BUG_ON(!bch2_bkey_pack_key(k, &uk, out));
-			}
-		}
-		break;
-	case 3:
-		if (version < bcachefs_metadata_version_snapshot &&
-		    (level || btree_type_has_snapshots(btree_id))) {
-			struct bkey_i *u = packed_to_bkey(k);
-
-			if (u) {
-				u->k.p.snapshot = write
-					? 0 : U32_MAX;
-			} else {
-				u64 min_packed = le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]);
-				u64 max_packed = min_packed +
-					~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
-
-				uk = __bch2_bkey_unpack_key(f, k);
-				uk.p.snapshot = write
-					? min_packed : min_t(u64, U32_MAX, max_packed);
-
-				BUG_ON(!bch2_bkey_pack_key(k, &uk, f));
-			}
-		}
-
-		break;
-	case 4: {
-		struct bkey_s u;
-
-		if (!bkey_packed(k)) {
-			u = bkey_i_to_s(packed_to_bkey(k));
-		} else {
-			uk = __bch2_bkey_unpack_key(f, k);
-			u.k = &uk;
-			u.v = bkeyp_val(f, k);
-		}
-
-		if (big_endian != CPU_BIG_ENDIAN)
-			bch2_bkey_swab_val(u);
-
-		ops = bch2_bkey_type_ops(k->type);
-
-		if (ops->compat)
-			ops->compat(btree_id, version, big_endian, write, u);
-		break;
-	}
-	default:
-		BUG();
-	}
-}
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
deleted file mode 100644
index 726ef7483763..000000000000
--- a/fs/bcachefs/bkey_methods.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_METHODS_H
-#define _BCACHEFS_BKEY_METHODS_H
-
-#include "bkey.h"
-
-struct bch_fs;
-struct btree;
-struct btree_trans;
-struct bkey;
-enum btree_node_type;
-
-extern const char * const bch2_bkey_types[];
-extern const struct bkey_ops bch2_bkey_null_ops;
-
-/*
- * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If
- * invalid, entire key will be deleted.
- *
- * When invalid, error string is returned via @err. @rw indicates whether key is
- * being read or written; more aggressive checks can be enabled when rw == WRITE.
- */
-struct bkey_ops {
-	int		(*key_invalid)(struct bch_fs *c, struct bkey_s_c k,
-				       enum bch_validate_flags flags, struct printbuf *err);
-	void		(*val_to_text)(struct printbuf *, struct bch_fs *,
-				       struct bkey_s_c);
-	void		(*swab)(struct bkey_s);
-	bool		(*key_normalize)(struct bch_fs *, struct bkey_s);
-	bool		(*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-	int		(*trigger)(struct btree_trans *, enum btree_id, unsigned,
-				   struct bkey_s_c, struct bkey_s,
-				   enum btree_iter_update_trigger_flags);
-	void		(*compat)(enum btree_id id, unsigned version,
-				  unsigned big_endian, int write,
-				  struct bkey_s);
-
-	/* Size of value type when first created: */
-	unsigned	min_val_size;
-};
-
-extern const struct bkey_ops bch2_bkey_ops[];
-
-static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type)
-{
-	return likely(type < KEY_TYPE_MAX)
-		? &bch2_bkey_ops[type]
-		: &bch2_bkey_null_ops;
-}
-
-int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c,
-			  enum bch_validate_flags, struct printbuf *);
-int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
-			enum bch_validate_flags, struct printbuf *);
-int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
-		      enum bch_validate_flags, struct printbuf *);
-int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *,
-			    struct bkey_s_c, struct printbuf *);
-
-void bch2_bpos_to_text(struct printbuf *, struct bpos);
-void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
-void bch2_val_to_text(struct printbuf *, struct bch_fs *,
-		      struct bkey_s_c);
-void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *,
-			   struct bkey_s_c);
-
-void bch2_bkey_swab_val(struct bkey_s);
-
-bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s);
-
-static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r)
-{
-	return l->type == r->type &&
-		!bversion_cmp(l->version, r->version) &&
-		bpos_eq(l->p, bkey_start_pos(r));
-}
-
-bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-
-static inline int bch2_key_trigger(struct btree_trans *trans,
-		enum btree_id btree, unsigned level,
-		struct bkey_s_c old, struct bkey_s new,
-		enum btree_iter_update_trigger_flags flags)
-{
-	const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type);
-
-	return ops->trigger
-		? ops->trigger(trans, btree, level, old, new, flags)
-		: 0;
-}
-
-static inline int bch2_key_trigger_old(struct btree_trans *trans,
-			enum btree_id btree_id, unsigned level,
-			struct bkey_s_c old,
-			enum btree_iter_update_trigger_flags flags)
-{
-	struct bkey_i deleted;
-
-	bkey_init(&deleted.k);
-	deleted.k.p = old.k->p;
-
-	return bch2_key_trigger(trans, btree_id, level, old, bkey_i_to_s(&deleted),
-				BTREE_TRIGGER_overwrite|flags);
-}
-
-static inline int bch2_key_trigger_new(struct btree_trans *trans,
-			enum btree_id btree_id, unsigned level,
-			struct bkey_s new,
-			enum btree_iter_update_trigger_flags flags)
-{
-	struct bkey_i deleted;
-
-	bkey_init(&deleted.k);
-	deleted.k.p = new.k->p;
-
-	return bch2_key_trigger(trans, btree_id, level, bkey_i_to_s_c(&deleted), new,
-				BTREE_TRIGGER_insert|flags);
-}
-
-void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
-
-void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned,
-			int, struct bkey_format *, struct bkey_packed *);
-
-static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id,
-			       unsigned version, unsigned big_endian,
-			       int write,
-			       struct bkey_format *f,
-			       struct bkey_packed *k)
-{
-	if (version < bcachefs_metadata_version_current ||
-	    big_endian != CPU_BIG_ENDIAN)
-		__bch2_bkey_compat(level, btree_id, version,
-				   big_endian, write, f, k);
-
-}
-
-#endif /* _BCACHEFS_BKEY_METHODS_H */
diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c
deleted file mode 100644
index 4536eb50fc40..000000000000
--- a/fs/bcachefs/bkey_sort.c
+++ /dev/null
@@ -1,214 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "bkey_cmp.h"
-#include "bkey_sort.h"
-#include "bset.h"
-#include "extents.h"
-
-typedef int (*sort_cmp_fn)(const struct btree *,
-			   const struct bkey_packed *,
-			   const struct bkey_packed *);
-
-static inline bool sort_iter_end(struct sort_iter *iter)
-{
-	return !iter->used;
-}
-
-static inline void sort_iter_sift(struct sort_iter *iter, unsigned from,
-				  sort_cmp_fn cmp)
-{
-	unsigned i;
-
-	for (i = from;
-	     i + 1 < iter->used &&
-	     cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0;
-	     i++)
-		swap(iter->data[i], iter->data[i + 1]);
-}
-
-static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
-{
-	unsigned i = iter->used;
-
-	while (i--)
-		sort_iter_sift(iter, i, cmp);
-}
-
-static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
-{
-	return !sort_iter_end(iter) ? iter->data->k : NULL;
-}
-
-static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
-{
-	struct sort_iter_set *i = iter->data;
-
-	BUG_ON(!iter->used);
-
-	i->k = bkey_p_next(i->k);
-
-	BUG_ON(i->k > i->end);
-
-	if (i->k == i->end)
-		array_remove_item(iter->data, iter->used, 0);
-	else
-		sort_iter_sift(iter, 0, cmp);
-}
-
-static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
-						 sort_cmp_fn cmp)
-{
-	struct bkey_packed *ret = sort_iter_peek(iter);
-
-	if (ret)
-		sort_iter_advance(iter, cmp);
-
-	return ret;
-}
-
-/*
- * If keys compare equal, compare by pointer order:
- */
-static inline int key_sort_fix_overlapping_cmp(const struct btree *b,
-					       const struct bkey_packed *l,
-					       const struct bkey_packed *r)
-{
-	return bch2_bkey_cmp_packed(b, l, r) ?:
-		cmp_int((unsigned long) l, (unsigned long) r);
-}
-
-static inline bool should_drop_next_key(struct sort_iter *iter)
-{
-	/*
-	 * key_sort_cmp() ensures that when keys compare equal the older key
-	 * comes first; so if l->k compares equal to r->k then l->k is older
-	 * and should be dropped.
-	 */
-	return iter->used >= 2 &&
-		!bch2_bkey_cmp_packed(iter->b,
-				 iter->data[0].k,
-				 iter->data[1].k);
-}
-
-struct btree_nr_keys
-bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
-			      struct sort_iter *iter)
-{
-	struct bkey_packed *out = dst->start;
-	struct bkey_packed *k;
-	struct btree_nr_keys nr;
-
-	memset(&nr, 0, sizeof(nr));
-
-	sort_iter_sort(iter, key_sort_fix_overlapping_cmp);
-
-	while ((k = sort_iter_peek(iter))) {
-		if (!bkey_deleted(k) &&
-		    !should_drop_next_key(iter)) {
-			bkey_p_copy(out, k);
-			btree_keys_account_key_add(&nr, 0, out);
-			out = bkey_p_next(out);
-		}
-
-		sort_iter_advance(iter, key_sort_fix_overlapping_cmp);
-	}
-
-	dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
-	return nr;
-}
-
-/* Sort + repack in a new format: */
-struct btree_nr_keys
-bch2_sort_repack(struct bset *dst, struct btree *src,
-		 struct btree_node_iter *src_iter,
-		 struct bkey_format *out_f,
-		 bool filter_whiteouts)
-{
-	struct bkey_format *in_f = &src->format;
-	struct bkey_packed *in, *out = vstruct_last(dst);
-	struct btree_nr_keys nr;
-	bool transform = memcmp(out_f, &src->format, sizeof(*out_f));
-
-	memset(&nr, 0, sizeof(nr));
-
-	while ((in = bch2_btree_node_iter_next_all(src_iter, src))) {
-		if (filter_whiteouts && bkey_deleted(in))
-			continue;
-
-		if (!transform)
-			bkey_p_copy(out, in);
-		else if (bch2_bkey_transform(out_f, out, bkey_packed(in)
-					     ? in_f : &bch2_bkey_format_current, in))
-			out->format = KEY_FORMAT_LOCAL_BTREE;
-		else
-			bch2_bkey_unpack(src, (void *) out, in);
-
-		out->needs_whiteout = false;
-
-		btree_keys_account_key_add(&nr, 0, out);
-		out = bkey_p_next(out);
-	}
-
-	dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
-	return nr;
-}
-
-static inline int keep_unwritten_whiteouts_cmp(const struct btree *b,
-				const struct bkey_packed *l,
-				const struct bkey_packed *r)
-{
-	return bch2_bkey_cmp_packed_inlined(b, l, r) ?:
-		(int) bkey_deleted(r) - (int) bkey_deleted(l) ?:
-		(long) l - (long) r;
-}
-
-#include "btree_update_interior.h"
-
-/*
- * For sorting in the btree node write path: whiteouts not in the unwritten
- * whiteouts area are dropped, whiteouts in the unwritten whiteouts area are
- * dropped if overwritten by real keys:
- */
-unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *dst, struct sort_iter *iter)
-{
-	struct bkey_packed *in, *next, *out = dst;
-
-	sort_iter_sort(iter, keep_unwritten_whiteouts_cmp);
-
-	while ((in = sort_iter_next(iter, keep_unwritten_whiteouts_cmp))) {
-		if (bkey_deleted(in) && in < unwritten_whiteouts_start(iter->b))
-			continue;
-
-		if ((next = sort_iter_peek(iter)) &&
-		    !bch2_bkey_cmp_packed_inlined(iter->b, in, next))
-			continue;
-
-		bkey_p_copy(out, in);
-		out = bkey_p_next(out);
-	}
-
-	return (u64 *) out - (u64 *) dst;
-}
-
-/*
- * Main sort routine for compacting a btree node in memory: we always drop
- * whiteouts because any whiteouts that need to be written are in the unwritten
- * whiteouts area:
- */
-unsigned bch2_sort_keys(struct bkey_packed *dst, struct sort_iter *iter)
-{
-	struct bkey_packed *in, *out = dst;
-
-	sort_iter_sort(iter, bch2_bkey_cmp_packed_inlined);
-
-	while ((in = sort_iter_next(iter, bch2_bkey_cmp_packed_inlined))) {
-		if (bkey_deleted(in))
-			continue;
-
-		bkey_p_copy(out, in);
-		out = bkey_p_next(out);
-	}
-
-	return (u64 *) out - (u64 *) dst;
-}
diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h
deleted file mode 100644
index 9be969d46890..000000000000
--- a/fs/bcachefs/bkey_sort.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_SORT_H
-#define _BCACHEFS_BKEY_SORT_H
-
-struct sort_iter {
-	struct btree		*b;
-	unsigned		used;
-	unsigned		size;
-
-	struct sort_iter_set {
-		struct bkey_packed *k, *end;
-	} data[];
-};
-
-static inline void sort_iter_init(struct sort_iter *iter, struct btree *b, unsigned size)
-{
-	iter->b = b;
-	iter->used = 0;
-	iter->size = size;
-}
-
-struct sort_iter_stack {
-	struct sort_iter	iter;
-	struct sort_iter_set	sets[MAX_BSETS + 1];
-};
-
-static inline void sort_iter_stack_init(struct sort_iter_stack *iter, struct btree *b)
-{
-	sort_iter_init(&iter->iter, b, ARRAY_SIZE(iter->sets));
-}
-
-static inline void sort_iter_add(struct sort_iter *iter,
-				 struct bkey_packed *k,
-				 struct bkey_packed *end)
-{
-	BUG_ON(iter->used >= iter->size);
-
-	if (k != end)
-		iter->data[iter->used++] = (struct sort_iter_set) { k, end };
-}
-
-struct btree_nr_keys
-bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *,
-			      struct sort_iter *);
-
-struct btree_nr_keys
-bch2_sort_repack(struct bset *, struct btree *,
-		 struct btree_node_iter *,
-		 struct bkey_format *, bool);
-
-unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *, struct sort_iter *);
-unsigned bch2_sort_keys(struct bkey_packed *, struct sort_iter *);
-
-#endif /* _BCACHEFS_BKEY_SORT_H */
diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h
deleted file mode 100644
index c9ae9e42b385..000000000000
--- a/fs/bcachefs/bkey_types.h
+++ /dev/null
@@ -1,213 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_TYPES_H
-#define _BCACHEFS_BKEY_TYPES_H
-
-#include "bcachefs_format.h"
-
-/*
- * bkey_i	- bkey with inline value
- * bkey_s	- bkey with split value
- * bkey_s_c	- bkey with split value, const
- */
-
-#define bkey_p_next(_k)		vstruct_next(_k)
-
-static inline struct bkey_i *bkey_next(struct bkey_i *k)
-{
-	return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
-}
-
-#define bkey_val_u64s(_k)	((_k)->u64s - BKEY_U64s)
-
-static inline size_t bkey_val_bytes(const struct bkey *k)
-{
-	return bkey_val_u64s(k) * sizeof(u64);
-}
-
-static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
-{
-	unsigned u64s = BKEY_U64s + val_u64s;
-
-	BUG_ON(u64s > U8_MAX);
-	k->u64s = u64s;
-}
-
-static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
-{
-	set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
-}
-
-#define bkey_val_end(_k)	((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
-
-#define bkey_deleted(_k)	((_k)->type == KEY_TYPE_deleted)
-
-#define bkey_whiteout(_k)				\
-	((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
-
-/* bkey with split value, const */
-struct bkey_s_c {
-	const struct bkey	*k;
-	const struct bch_val	*v;
-};
-
-/* bkey with split value */
-struct bkey_s {
-	union {
-	struct {
-		struct bkey	*k;
-		struct bch_val	*v;
-	};
-	struct bkey_s_c		s_c;
-	};
-};
-
-#define bkey_s_null		((struct bkey_s)   { .k = NULL })
-#define bkey_s_c_null		((struct bkey_s_c) { .k = NULL })
-
-#define bkey_s_err(err)		((struct bkey_s)   { .k = ERR_PTR(err) })
-#define bkey_s_c_err(err)	((struct bkey_s_c) { .k = ERR_PTR(err) })
-
-static inline struct bkey_s bkey_to_s(struct bkey *k)
-{
-	return (struct bkey_s) { .k = k, .v = NULL };
-}
-
-static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
-{
-	return (struct bkey_s_c) { .k = k, .v = NULL };
-}
-
-static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
-{
-	return (struct bkey_s) { .k = &k->k, .v = &k->v };
-}
-
-static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
-{
-	return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
-}
-
-/*
- * For a given type of value (e.g. struct bch_extent), generates the types for
- * bkey + bch_extent - inline, split, split const - and also all the conversion
- * functions, which also check that the value is of the correct type.
- *
- * We use anonymous unions for upcasting - e.g. converting from e.g. a
- * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
- * functions.
- */
-#define x(name, ...)					\
-struct bkey_i_##name {							\
-	union {								\
-		struct bkey		k;				\
-		struct bkey_i		k_i;				\
-	};								\
-	struct bch_##name		v;				\
-};									\
-									\
-struct bkey_s_c_##name {						\
-	union {								\
-	struct {							\
-		const struct bkey	*k;				\
-		const struct bch_##name	*v;				\
-	};								\
-	struct bkey_s_c			s_c;				\
-	};								\
-};									\
-									\
-struct bkey_s_##name {							\
-	union {								\
-	struct {							\
-		struct bkey		*k;				\
-		struct bch_##name	*v;				\
-	};								\
-	struct bkey_s_c_##name		c;				\
-	struct bkey_s			s;				\
-	struct bkey_s_c			s_c;				\
-	};								\
-};									\
-									\
-static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k)	\
-{									\
-	EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);	\
-	return container_of(&k->k, struct bkey_i_##name, k);		\
-}									\
-									\
-static inline const struct bkey_i_##name *				\
-bkey_i_to_##name##_c(const struct bkey_i *k)				\
-{									\
-	EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);	\
-	return container_of(&k->k, struct bkey_i_##name, k);		\
-}									\
-									\
-static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k)	\
-{									\
-	EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name);	\
-	return (struct bkey_s_##name) {					\
-		.k = k.k,						\
-		.v = container_of(k.v, struct bch_##name, v),		\
-	};								\
-}									\
-									\
-static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
-{									\
-	EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name);	\
-	return (struct bkey_s_c_##name) {				\
-		.k = k.k,						\
-		.v = container_of(k.v, struct bch_##name, v),		\
-	};								\
-}									\
-									\
-static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
-{									\
-	return (struct bkey_s_##name) {					\
-		.k = &k->k,						\
-		.v = &k->v,						\
-	};								\
-}									\
-									\
-static inline struct bkey_s_c_##name					\
-name##_i_to_s_c(const struct bkey_i_##name *k)				\
-{									\
-	return (struct bkey_s_c_##name) {				\
-		.k = &k->k,						\
-		.v = &k->v,						\
-	};								\
-}									\
-									\
-static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k)	\
-{									\
-	EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);	\
-	return (struct bkey_s_##name) {					\
-		.k = &k->k,						\
-		.v = container_of(&k->v, struct bch_##name, v),		\
-	};								\
-}									\
-									\
-static inline struct bkey_s_c_##name					\
-bkey_i_to_s_c_##name(const struct bkey_i *k)				\
-{									\
-	EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);	\
-	return (struct bkey_s_c_##name) {				\
-		.k = &k->k,						\
-		.v = container_of(&k->v, struct bch_##name, v),		\
-	};								\
-}									\
-									\
-static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
-{									\
-	struct bkey_i_##name *k =					\
-		container_of(&_k->k, struct bkey_i_##name, k);		\
-									\
-	bkey_init(&k->k);						\
-	memset(&k->v, 0, sizeof(k->v));					\
-	k->k.type = KEY_TYPE_##name;					\
-	set_bkey_val_bytes(&k->k, sizeof(k->v));			\
-									\
-	return k;							\
-}
-
-BCH_BKEY_TYPES();
-#undef x
-
-#endif /* _BCACHEFS_BKEY_TYPES_H */
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
deleted file mode 100644
index 575e1d0b6eeb..000000000000
--- a/fs/bcachefs/bset.c
+++ /dev/null
@@ -1,1594 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Code for working with individual keys, and sorted sets of keys with in a
- * btree node
- *
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "bset.h"
-#include "eytzinger.h"
-#include "trace.h"
-#include "util.h"
-
-#include <asm/unaligned.h>
-#include <linux/console.h>
-#include <linux/random.h>
-#include <linux/prefetch.h>
-
-static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *,
-						  struct btree *);
-
-static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
-{
-	unsigned n = ARRAY_SIZE(iter->data);
-
-	while (n && __btree_node_iter_set_end(iter, n - 1))
-		--n;
-
-	return n;
-}
-
-struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
-{
-	return bch2_bkey_to_bset_inlined(b, k);
-}
-
-/*
- * There are never duplicate live keys in the btree - but including keys that
- * have been flagged as deleted (and will be cleaned up later) we _will_ see
- * duplicates.
- *
- * Thus the sort order is: usual key comparison first, but for keys that compare
- * equal the deleted key(s) come first, and the (at most one) live version comes
- * last.
- *
- * The main reason for this is insertion: to handle overwrites, we first iterate
- * over keys that compare equal to our insert key, and then insert immediately
- * prior to the first key greater than the key we're inserting - our insert
- * position will be after all keys that compare equal to our insert key, which
- * by the time we actually do the insert will all be deleted.
- */
-
-void bch2_dump_bset(struct bch_fs *c, struct btree *b,
-		    struct bset *i, unsigned set)
-{
-	struct bkey_packed *_k, *_n;
-	struct bkey uk, n;
-	struct bkey_s_c k;
-	struct printbuf buf = PRINTBUF;
-
-	if (!i->u64s)
-		return;
-
-	for (_k = i->start;
-	     _k < vstruct_last(i);
-	     _k = _n) {
-		_n = bkey_p_next(_k);
-
-		if (!_k->u64s) {
-			printk(KERN_ERR "block %u key %5zu - u64s 0? aieee!\n", set,
-			       _k->_data - i->_data);
-			break;
-		}
-
-		k = bkey_disassemble(b, _k, &uk);
-
-		printbuf_reset(&buf);
-		if (c)
-			bch2_bkey_val_to_text(&buf, c, k);
-		else
-			bch2_bkey_to_text(&buf, k.k);
-		printk(KERN_ERR "block %u key %5zu: %s\n", set,
-		       _k->_data - i->_data, buf.buf);
-
-		if (_n == vstruct_last(i))
-			continue;
-
-		n = bkey_unpack_key(b, _n);
-
-		if (bpos_lt(n.p, k.k->p)) {
-			printk(KERN_ERR "Key skipped backwards\n");
-			continue;
-		}
-
-		if (!bkey_deleted(k.k) && bpos_eq(n.p, k.k->p))
-			printk(KERN_ERR "Duplicate keys\n");
-	}
-
-	printbuf_exit(&buf);
-}
-
-void bch2_dump_btree_node(struct bch_fs *c, struct btree *b)
-{
-	console_lock();
-	for_each_bset(b, t)
-		bch2_dump_bset(c, b, bset(b, t), t - b->set);
-	console_unlock();
-}
-
-void bch2_dump_btree_node_iter(struct btree *b,
-			      struct btree_node_iter *iter)
-{
-	struct btree_node_iter_set *set;
-	struct printbuf buf = PRINTBUF;
-
-	printk(KERN_ERR "btree node iter with %u/%u sets:\n",
-	       __btree_node_iter_used(iter), b->nsets);
-
-	btree_node_iter_for_each(iter, set) {
-		struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
-		struct bset_tree *t = bch2_bkey_to_bset(b, k);
-		struct bkey uk = bkey_unpack_key(b, k);
-
-		printbuf_reset(&buf);
-		bch2_bkey_to_text(&buf, &uk);
-		printk(KERN_ERR "set %zu key %u: %s\n",
-		       t - b->set, set->k, buf.buf);
-	}
-
-	printbuf_exit(&buf);
-}
-
-struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b)
-{
-	struct bkey_packed *k;
-	struct btree_nr_keys nr = {};
-
-	for_each_bset(b, t)
-		bset_tree_for_each_key(b, t, k)
-			if (!bkey_deleted(k))
-				btree_keys_account_key_add(&nr, t - b->set, k);
-	return nr;
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-void __bch2_verify_btree_nr_keys(struct btree *b)
-{
-	struct btree_nr_keys nr = bch2_btree_node_count_keys(b);
-
-	BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
-}
-
-static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
-					    struct btree *b)
-{
-	struct btree_node_iter iter = *_iter;
-	const struct bkey_packed *k, *n;
-
-	k = bch2_btree_node_iter_peek_all(&iter, b);
-	__bch2_btree_node_iter_advance(&iter, b);
-	n = bch2_btree_node_iter_peek_all(&iter, b);
-
-	bkey_unpack_key(b, k);
-
-	if (n &&
-	    bkey_iter_cmp(b, k, n) > 0) {
-		struct btree_node_iter_set *set;
-		struct bkey ku = bkey_unpack_key(b, k);
-		struct bkey nu = bkey_unpack_key(b, n);
-		struct printbuf buf1 = PRINTBUF;
-		struct printbuf buf2 = PRINTBUF;
-
-		bch2_dump_btree_node(NULL, b);
-		bch2_bkey_to_text(&buf1, &ku);
-		bch2_bkey_to_text(&buf2, &nu);
-		printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n",
-		       buf1.buf, buf2.buf);
-		printk(KERN_ERR "iter was:");
-
-		btree_node_iter_for_each(_iter, set) {
-			struct bkey_packed *k2 = __btree_node_offset_to_key(b, set->k);
-			struct bset_tree *t = bch2_bkey_to_bset(b, k2);
-			printk(" [%zi %zi]", t - b->set,
-			       k2->_data - bset(b, t)->_data);
-		}
-		panic("\n");
-	}
-}
-
-void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
-				 struct btree *b)
-{
-	struct btree_node_iter_set *set, *s2;
-	struct bkey_packed *k, *p;
-
-	if (bch2_btree_node_iter_end(iter))
-		return;
-
-	/* Verify no duplicates: */
-	btree_node_iter_for_each(iter, set) {
-		BUG_ON(set->k > set->end);
-		btree_node_iter_for_each(iter, s2)
-			BUG_ON(set != s2 && set->end == s2->end);
-	}
-
-	/* Verify that set->end is correct: */
-	btree_node_iter_for_each(iter, set) {
-		for_each_bset(b, t)
-			if (set->end == t->end_offset) {
-				BUG_ON(set->k < btree_bkey_first_offset(t) ||
-				       set->k >= t->end_offset);
-				goto found;
-			}
-		BUG();
-found:
-		do {} while (0);
-	}
-
-	/* Verify iterator is sorted: */
-	btree_node_iter_for_each(iter, set)
-		BUG_ON(set != iter->data &&
-		       btree_node_iter_cmp(b, set[-1], set[0]) > 0);
-
-	k = bch2_btree_node_iter_peek_all(iter, b);
-
-	for_each_bset(b, t) {
-		if (iter->data[0].end == t->end_offset)
-			continue;
-
-		p = bch2_bkey_prev_all(b, t,
-			bch2_btree_node_iter_bset_pos(iter, b, t));
-
-		BUG_ON(p && bkey_iter_cmp(b, k, p) < 0);
-	}
-}
-
-void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
-			    struct bkey_packed *insert, unsigned clobber_u64s)
-{
-	struct bset_tree *t = bch2_bkey_to_bset(b, where);
-	struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
-	struct bkey_packed *next = (void *) ((u64 *) where->_data + clobber_u64s);
-	struct printbuf buf1 = PRINTBUF;
-	struct printbuf buf2 = PRINTBUF;
-#if 0
-	BUG_ON(prev &&
-	       bkey_iter_cmp(b, prev, insert) > 0);
-#else
-	if (prev &&
-	    bkey_iter_cmp(b, prev, insert) > 0) {
-		struct bkey k1 = bkey_unpack_key(b, prev);
-		struct bkey k2 = bkey_unpack_key(b, insert);
-
-		bch2_dump_btree_node(NULL, b);
-		bch2_bkey_to_text(&buf1, &k1);
-		bch2_bkey_to_text(&buf2, &k2);
-
-		panic("prev > insert:\n"
-		      "prev    key %s\n"
-		      "insert  key %s\n",
-		      buf1.buf, buf2.buf);
-	}
-#endif
-#if 0
-	BUG_ON(next != btree_bkey_last(b, t) &&
-	       bkey_iter_cmp(b, insert, next) > 0);
-#else
-	if (next != btree_bkey_last(b, t) &&
-	    bkey_iter_cmp(b, insert, next) > 0) {
-		struct bkey k1 = bkey_unpack_key(b, insert);
-		struct bkey k2 = bkey_unpack_key(b, next);
-
-		bch2_dump_btree_node(NULL, b);
-		bch2_bkey_to_text(&buf1, &k1);
-		bch2_bkey_to_text(&buf2, &k2);
-
-		panic("insert > next:\n"
-		      "insert  key %s\n"
-		      "next    key %s\n",
-		      buf1.buf, buf2.buf);
-	}
-#endif
-}
-
-#else
-
-static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
-						   struct btree *b) {}
-
-#endif
-
-/* Auxiliary search trees */
-
-#define BFLOAT_FAILED_UNPACKED	U8_MAX
-#define BFLOAT_FAILED		U8_MAX
-
-struct bkey_float {
-	u8		exponent;
-	u8		key_offset;
-	u16		mantissa;
-};
-#define BKEY_MANTISSA_BITS	16
-
-static unsigned bkey_float_byte_offset(unsigned idx)
-{
-	return idx * sizeof(struct bkey_float);
-}
-
-struct ro_aux_tree {
-	u8			nothing[0];
-	struct bkey_float	f[];
-};
-
-struct rw_aux_tree {
-	u16		offset;
-	struct bpos	k;
-};
-
-static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
-{
-	BUG_ON(t->aux_data_offset == U16_MAX);
-
-	switch (bset_aux_tree_type(t)) {
-	case BSET_NO_AUX_TREE:
-		return t->aux_data_offset;
-	case BSET_RO_AUX_TREE:
-		return t->aux_data_offset +
-			DIV_ROUND_UP(t->size * sizeof(struct bkey_float) +
-				     t->size * sizeof(u8), 8);
-	case BSET_RW_AUX_TREE:
-		return t->aux_data_offset +
-			DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
-	default:
-		BUG();
-	}
-}
-
-static unsigned bset_aux_tree_buf_start(const struct btree *b,
-					const struct bset_tree *t)
-{
-	return t == b->set
-		? DIV_ROUND_UP(b->unpack_fn_len, 8)
-		: bset_aux_tree_buf_end(t - 1);
-}
-
-static void *__aux_tree_base(const struct btree *b,
-			     const struct bset_tree *t)
-{
-	return b->aux_data + t->aux_data_offset * 8;
-}
-
-static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
-					    const struct bset_tree *t)
-{
-	EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
-
-	return __aux_tree_base(b, t);
-}
-
-static u8 *ro_aux_tree_prev(const struct btree *b,
-			    const struct bset_tree *t)
-{
-	EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
-
-	return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size);
-}
-
-static struct bkey_float *bkey_float(const struct btree *b,
-				     const struct bset_tree *t,
-				     unsigned idx)
-{
-	return ro_aux_tree_base(b, t)->f + idx;
-}
-
-static void bset_aux_tree_verify(struct btree *b)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
-	for_each_bset(b, t) {
-		if (t->aux_data_offset == U16_MAX)
-			continue;
-
-		BUG_ON(t != b->set &&
-		       t[-1].aux_data_offset == U16_MAX);
-
-		BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t));
-		BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
-		BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
-	}
-#endif
-}
-
-void bch2_btree_keys_init(struct btree *b)
-{
-	unsigned i;
-
-	b->nsets		= 0;
-	memset(&b->nr, 0, sizeof(b->nr));
-
-	for (i = 0; i < MAX_BSETS; i++)
-		b->set[i].data_offset = U16_MAX;
-
-	bch2_bset_set_no_aux_tree(b, b->set);
-}
-
-/* Binary tree stuff for auxiliary search trees */
-
-/*
- * Cacheline/offset <-> bkey pointer arithmetic:
- *
- * t->tree is a binary search tree in an array; each node corresponds to a key
- * in one cacheline in t->set (BSET_CACHELINE bytes).
- *
- * This means we don't have to store the full index of the key that a node in
- * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and
- * then bkey_float->m gives us the offset within that cacheline, in units of 8
- * bytes.
- *
- * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
- * make this work.
- *
- * To construct the bfloat for an arbitrary key we need to know what the key
- * immediately preceding it is: we have to check if the two keys differ in the
- * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size
- * of the previous key so we can walk backwards to it from t->tree[j]'s key.
- */
-
-static inline void *bset_cacheline(const struct btree *b,
-				   const struct bset_tree *t,
-				   unsigned cacheline)
-{
-	return (void *) round_down((unsigned long) btree_bkey_first(b, t),
-				   L1_CACHE_BYTES) +
-		cacheline * BSET_CACHELINE;
-}
-
-static struct bkey_packed *cacheline_to_bkey(const struct btree *b,
-					     const struct bset_tree *t,
-					     unsigned cacheline,
-					     unsigned offset)
-{
-	return bset_cacheline(b, t, cacheline) + offset * 8;
-}
-
-static unsigned bkey_to_cacheline(const struct btree *b,
-				  const struct bset_tree *t,
-				  const struct bkey_packed *k)
-{
-	return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE;
-}
-
-static ssize_t __bkey_to_cacheline_offset(const struct btree *b,
-					  const struct bset_tree *t,
-					  unsigned cacheline,
-					  const struct bkey_packed *k)
-{
-	return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline);
-}
-
-static unsigned bkey_to_cacheline_offset(const struct btree *b,
-					 const struct bset_tree *t,
-					 unsigned cacheline,
-					 const struct bkey_packed *k)
-{
-	size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k);
-
-	EBUG_ON(m > U8_MAX);
-	return m;
-}
-
-static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
-					       const struct bset_tree *t,
-					       unsigned j)
-{
-	return cacheline_to_bkey(b, t,
-			__eytzinger1_to_inorder(j, t->size - 1, t->extra),
-			bkey_float(b, t, j)->key_offset);
-}
-
-static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
-					     const struct bset_tree *t,
-					     unsigned j)
-{
-	unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];
-
-	return (void *) ((u64 *) tree_to_bkey(b, t, j)->_data - prev_u64s);
-}
-
-static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
-				       const struct bset_tree *t)
-{
-	EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
-
-	return __aux_tree_base(b, t);
-}
-
-/*
- * For the write set - the one we're currently inserting keys into - we don't
- * maintain a full search tree, we just keep a simple lookup table in t->prev.
- */
-static struct bkey_packed *rw_aux_to_bkey(const struct btree *b,
-					  struct bset_tree *t,
-					  unsigned j)
-{
-	return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset);
-}
-
-static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
-			    unsigned j, struct bkey_packed *k)
-{
-	EBUG_ON(k >= btree_bkey_last(b, t));
-
-	rw_aux_tree(b, t)[j] = (struct rw_aux_tree) {
-		.offset	= __btree_node_key_to_offset(b, k),
-		.k	= bkey_unpack_pos(b, k),
-	};
-}
-
-static void bch2_bset_verify_rw_aux_tree(struct btree *b,
-					struct bset_tree *t)
-{
-	struct bkey_packed *k = btree_bkey_first(b, t);
-	unsigned j = 0;
-
-	if (!bch2_expensive_debug_checks)
-		return;
-
-	BUG_ON(bset_has_ro_aux_tree(t));
-
-	if (!bset_has_rw_aux_tree(t))
-		return;
-
-	BUG_ON(t->size < 1);
-	BUG_ON(rw_aux_to_bkey(b, t, j) != k);
-
-	goto start;
-	while (1) {
-		if (rw_aux_to_bkey(b, t, j) == k) {
-			BUG_ON(!bpos_eq(rw_aux_tree(b, t)[j].k,
-					bkey_unpack_pos(b, k)));
-start:
-			if (++j == t->size)
-				break;
-
-			BUG_ON(rw_aux_tree(b, t)[j].offset <=
-			       rw_aux_tree(b, t)[j - 1].offset);
-		}
-
-		k = bkey_p_next(k);
-		BUG_ON(k >= btree_bkey_last(b, t));
-	}
-}
-
-/* returns idx of first entry >= offset: */
-static unsigned rw_aux_tree_bsearch(struct btree *b,
-				    struct bset_tree *t,
-				    unsigned offset)
-{
-	unsigned bset_offs = offset - btree_bkey_first_offset(t);
-	unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t);
-	unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0;
-
-	EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
-	EBUG_ON(!t->size);
-	EBUG_ON(idx > t->size);
-
-	while (idx < t->size &&
-	       rw_aux_tree(b, t)[idx].offset < offset)
-		idx++;
-
-	while (idx &&
-	       rw_aux_tree(b, t)[idx - 1].offset >= offset)
-		idx--;
-
-	EBUG_ON(idx < t->size &&
-		rw_aux_tree(b, t)[idx].offset < offset);
-	EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset);
-	EBUG_ON(idx + 1 < t->size &&
-		rw_aux_tree(b, t)[idx].offset ==
-		rw_aux_tree(b, t)[idx + 1].offset);
-
-	return idx;
-}
-
-static inline unsigned bkey_mantissa(const struct bkey_packed *k,
-				     const struct bkey_float *f,
-				     unsigned idx)
-{
-	u64 v;
-
-	EBUG_ON(!bkey_packed(k));
-
-	v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3)));
-
-	/*
-	 * In little endian, we're shifting off low bits (and then the bits we
-	 * want are at the low end), in big endian we're shifting off high bits
-	 * (and then the bits we want are at the high end, so we shift them
-	 * back down):
-	 */
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-	v >>= f->exponent & 7;
-#else
-	v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS;
-#endif
-	return (u16) v;
-}
-
-static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t,
-					unsigned j,
-					struct bkey_packed *min_key,
-					struct bkey_packed *max_key)
-{
-	struct bkey_float *f = bkey_float(b, t, j);
-	struct bkey_packed *m = tree_to_bkey(b, t, j);
-	struct bkey_packed *l = is_power_of_2(j)
-		? min_key
-		: tree_to_prev_bkey(b, t, j >> ffs(j));
-	struct bkey_packed *r = is_power_of_2(j + 1)
-		? max_key
-		: tree_to_bkey(b, t, j >> (ffz(j) + 1));
-	unsigned mantissa;
-	int shift, exponent, high_bit;
-
-	/*
-	 * for failed bfloats, the lookup code falls back to comparing against
-	 * the original key.
-	 */
-
-	if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) ||
-	    !b->nr_key_bits) {
-		f->exponent = BFLOAT_FAILED_UNPACKED;
-		return;
-	}
-
-	/*
-	 * The greatest differing bit of l and r is the first bit we must
-	 * include in the bfloat mantissa we're creating in order to do
-	 * comparisons - that bit always becomes the high bit of
-	 * bfloat->mantissa, and thus the exponent we're calculating here is
-	 * the position of what will become the low bit in bfloat->mantissa:
-	 *
-	 * Note that this may be negative - we may be running off the low end
-	 * of the key: we handle this later:
-	 */
-	high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r),
-		       min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1);
-	exponent = high_bit - (BKEY_MANTISSA_BITS - 1);
-
-	/*
-	 * Then we calculate the actual shift value, from the start of the key
-	 * (k->_data), to get the key bits starting at exponent:
-	 */
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-	shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
-
-	EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64);
-#else
-	shift = high_bit_offset +
-		b->nr_key_bits -
-		exponent -
-		BKEY_MANTISSA_BITS;
-
-	EBUG_ON(shift < KEY_PACKED_BITS_START);
-#endif
-	EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
-
-	f->exponent = shift;
-	mantissa = bkey_mantissa(m, f, j);
-
-	/*
-	 * If we've got garbage bits, set them to all 1s - it's legal for the
-	 * bfloat to compare larger than the original key, but not smaller:
-	 */
-	if (exponent < 0)
-		mantissa |= ~(~0U << -exponent);
-
-	f->mantissa = mantissa;
-}
-
-/* bytes remaining - only valid for last bset: */
-static unsigned __bset_tree_capacity(struct btree *b, const struct bset_tree *t)
-{
-	bset_aux_tree_verify(b);
-
-	return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
-}
-
-static unsigned bset_ro_tree_capacity(struct btree *b, const struct bset_tree *t)
-{
-	return __bset_tree_capacity(b, t) /
-		(sizeof(struct bkey_float) + sizeof(u8));
-}
-
-static unsigned bset_rw_tree_capacity(struct btree *b, const struct bset_tree *t)
-{
-	return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
-}
-
-static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
-{
-	struct bkey_packed *k;
-
-	t->size = 1;
-	t->extra = BSET_RW_AUX_TREE_VAL;
-	rw_aux_tree(b, t)[0].offset =
-		__btree_node_key_to_offset(b, btree_bkey_first(b, t));
-
-	bset_tree_for_each_key(b, t, k) {
-		if (t->size == bset_rw_tree_capacity(b, t))
-			break;
-
-		if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) >
-		    L1_CACHE_BYTES)
-			rw_aux_tree_set(b, t, t->size++, k);
-	}
-}
-
-static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
-{
-	struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
-	struct bkey_i min_key, max_key;
-	unsigned cacheline = 1;
-
-	t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
-		      bset_ro_tree_capacity(b, t));
-retry:
-	if (t->size < 2) {
-		t->size = 0;
-		t->extra = BSET_NO_AUX_TREE_VAL;
-		return;
-	}
-
-	t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
-
-	/* First we figure out where the first key in each cacheline is */
-	eytzinger1_for_each(j, t->size - 1) {
-		while (bkey_to_cacheline(b, t, k) < cacheline)
-			prev = k, k = bkey_p_next(k);
-
-		if (k >= btree_bkey_last(b, t)) {
-			/* XXX: this path sucks */
-			t->size--;
-			goto retry;
-		}
-
-		ro_aux_tree_prev(b, t)[j] = prev->u64s;
-		bkey_float(b, t, j)->key_offset =
-			bkey_to_cacheline_offset(b, t, cacheline++, k);
-
-		EBUG_ON(tree_to_prev_bkey(b, t, j) != prev);
-		EBUG_ON(tree_to_bkey(b, t, j) != k);
-	}
-
-	while (k != btree_bkey_last(b, t))
-		prev = k, k = bkey_p_next(k);
-
-	if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) {
-		bkey_init(&min_key.k);
-		min_key.k.p = b->data->min_key;
-	}
-
-	if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) {
-		bkey_init(&max_key.k);
-		max_key.k.p = b->data->max_key;
-	}
-
-	/* Then we build the tree */
-	eytzinger1_for_each(j, t->size - 1)
-		make_bfloat(b, t, j,
-			    bkey_to_packed(&min_key),
-			    bkey_to_packed(&max_key));
-}
-
-static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
-{
-	struct bset_tree *i;
-
-	for (i = b->set; i != t; i++)
-		BUG_ON(bset_has_rw_aux_tree(i));
-
-	bch2_bset_set_no_aux_tree(b, t);
-
-	/* round up to next cacheline: */
-	t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t),
-				      SMP_CACHE_BYTES / sizeof(u64));
-
-	bset_aux_tree_verify(b);
-}
-
-void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t,
-			     bool writeable)
-{
-	if (writeable
-	    ? bset_has_rw_aux_tree(t)
-	    : bset_has_ro_aux_tree(t))
-		return;
-
-	bset_alloc_tree(b, t);
-
-	if (!__bset_tree_capacity(b, t))
-		return;
-
-	if (writeable)
-		__build_rw_aux_tree(b, t);
-	else
-		__build_ro_aux_tree(b, t);
-
-	bset_aux_tree_verify(b);
-}
-
-void bch2_bset_init_first(struct btree *b, struct bset *i)
-{
-	struct bset_tree *t;
-
-	BUG_ON(b->nsets);
-
-	memset(i, 0, sizeof(*i));
-	get_random_bytes(&i->seq, sizeof(i->seq));
-	SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
-
-	t = &b->set[b->nsets++];
-	set_btree_bset(b, t, i);
-}
-
-void bch2_bset_init_next(struct btree *b, struct btree_node_entry *bne)
-{
-	struct bset *i = &bne->keys;
-	struct bset_tree *t;
-
-	BUG_ON(bset_byte_offset(b, bne) >= btree_buf_bytes(b));
-	BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
-	BUG_ON(b->nsets >= MAX_BSETS);
-
-	memset(i, 0, sizeof(*i));
-	i->seq = btree_bset_first(b)->seq;
-	SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
-
-	t = &b->set[b->nsets++];
-	set_btree_bset(b, t, i);
-}
-
-/*
- * find _some_ key in the same bset as @k that precedes @k - not necessarily the
- * immediate predecessor:
- */
-static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
-				       struct bkey_packed *k)
-{
-	struct bkey_packed *p;
-	unsigned offset;
-	int j;
-
-	EBUG_ON(k < btree_bkey_first(b, t) ||
-		k > btree_bkey_last(b, t));
-
-	if (k == btree_bkey_first(b, t))
-		return NULL;
-
-	switch (bset_aux_tree_type(t)) {
-	case BSET_NO_AUX_TREE:
-		p = btree_bkey_first(b, t);
-		break;
-	case BSET_RO_AUX_TREE:
-		j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k));
-
-		do {
-			p = j ? tree_to_bkey(b, t,
-					__inorder_to_eytzinger1(j--,
-							t->size - 1, t->extra))
-			      : btree_bkey_first(b, t);
-		} while (p >= k);
-		break;
-	case BSET_RW_AUX_TREE:
-		offset = __btree_node_key_to_offset(b, k);
-		j = rw_aux_tree_bsearch(b, t, offset);
-		p = j ? rw_aux_to_bkey(b, t, j - 1)
-		      : btree_bkey_first(b, t);
-		break;
-	}
-
-	return p;
-}
-
-struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
-					  struct bset_tree *t,
-					  struct bkey_packed *k,
-					  unsigned min_key_type)
-{
-	struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;
-
-	while ((p = __bkey_prev(b, t, k)) && !ret) {
-		for (i = p; i != k; i = bkey_p_next(i))
-			if (i->type >= min_key_type)
-				ret = i;
-
-		k = p;
-	}
-
-	if (bch2_expensive_debug_checks) {
-		BUG_ON(ret >= orig_k);
-
-		for (i = ret
-			? bkey_p_next(ret)
-			: btree_bkey_first(b, t);
-		     i != orig_k;
-		     i = bkey_p_next(i))
-			BUG_ON(i->type >= min_key_type);
-	}
-
-	return ret;
-}
-
-/* Insert */
-
-static void bch2_bset_fix_lookup_table(struct btree *b,
-				       struct bset_tree *t,
-				       struct bkey_packed *_where,
-				       unsigned clobber_u64s,
-				       unsigned new_u64s)
-{
-	int shift = new_u64s - clobber_u64s;
-	unsigned l, j, where = __btree_node_key_to_offset(b, _where);
-
-	EBUG_ON(bset_has_ro_aux_tree(t));
-
-	if (!bset_has_rw_aux_tree(t))
-		return;
-
-	/* returns first entry >= where */
-	l = rw_aux_tree_bsearch(b, t, where);
-
-	if (!l) /* never delete first entry */
-		l++;
-	else if (l < t->size &&
-		 where < t->end_offset &&
-		 rw_aux_tree(b, t)[l].offset == where)
-		rw_aux_tree_set(b, t, l++, _where);
-
-	/* l now > where */
-
-	for (j = l;
-	     j < t->size &&
-	     rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
-	     j++)
-		;
-
-	if (j < t->size &&
-	    rw_aux_tree(b, t)[j].offset + shift ==
-	    rw_aux_tree(b, t)[l - 1].offset)
-		j++;
-
-	memmove(&rw_aux_tree(b, t)[l],
-		&rw_aux_tree(b, t)[j],
-		(void *) &rw_aux_tree(b, t)[t->size] -
-		(void *) &rw_aux_tree(b, t)[j]);
-	t->size -= j - l;
-
-	for (j = l; j < t->size; j++)
-		rw_aux_tree(b, t)[j].offset += shift;
-
-	EBUG_ON(l < t->size &&
-		rw_aux_tree(b, t)[l].offset ==
-		rw_aux_tree(b, t)[l - 1].offset);
-
-	if (t->size < bset_rw_tree_capacity(b, t) &&
-	    (l < t->size
-	     ? rw_aux_tree(b, t)[l].offset
-	     : t->end_offset) -
-	    rw_aux_tree(b, t)[l - 1].offset >
-	    L1_CACHE_BYTES / sizeof(u64)) {
-		struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
-		struct bkey_packed *end = l < t->size
-			? rw_aux_to_bkey(b, t, l)
-			: btree_bkey_last(b, t);
-		struct bkey_packed *k = start;
-
-		while (1) {
-			k = bkey_p_next(k);
-			if (k == end)
-				break;
-
-			if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
-				memmove(&rw_aux_tree(b, t)[l + 1],
-					&rw_aux_tree(b, t)[l],
-					(void *) &rw_aux_tree(b, t)[t->size] -
-					(void *) &rw_aux_tree(b, t)[l]);
-				t->size++;
-				rw_aux_tree_set(b, t, l, k);
-				break;
-			}
-		}
-	}
-
-	bch2_bset_verify_rw_aux_tree(b, t);
-	bset_aux_tree_verify(b);
-}
-
-void bch2_bset_insert(struct btree *b,
-		      struct btree_node_iter *iter,
-		      struct bkey_packed *where,
-		      struct bkey_i *insert,
-		      unsigned clobber_u64s)
-{
-	struct bkey_format *f = &b->format;
-	struct bset_tree *t = bset_tree_last(b);
-	struct bkey_packed packed, *src = bkey_to_packed(insert);
-
-	bch2_bset_verify_rw_aux_tree(b, t);
-	bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s);
-
-	if (bch2_bkey_pack_key(&packed, &insert->k, f))
-		src = &packed;
-
-	if (!bkey_deleted(&insert->k))
-		btree_keys_account_key_add(&b->nr, t - b->set, src);
-
-	if (src->u64s != clobber_u64s) {
-		u64 *src_p = (u64 *) where->_data + clobber_u64s;
-		u64 *dst_p = (u64 *) where->_data + src->u64s;
-
-		EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
-			(int) clobber_u64s - src->u64s);
-
-		memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
-		le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s);
-		set_btree_bset_end(b, t);
-	}
-
-	memcpy_u64s_small(where, src,
-		    bkeyp_key_u64s(f, src));
-	memcpy_u64s(bkeyp_val(f, where), &insert->v,
-		    bkeyp_val_u64s(f, src));
-
-	if (src->u64s != clobber_u64s)
-		bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
-
-	bch2_verify_btree_nr_keys(b);
-}
-
-void bch2_bset_delete(struct btree *b,
-		      struct bkey_packed *where,
-		      unsigned clobber_u64s)
-{
-	struct bset_tree *t = bset_tree_last(b);
-	u64 *src_p = (u64 *) where->_data + clobber_u64s;
-	u64 *dst_p = where->_data;
-
-	bch2_bset_verify_rw_aux_tree(b, t);
-
-	EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s);
-
-	memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
-	le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s);
-	set_btree_bset_end(b, t);
-
-	bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0);
-}
-
-/* Lookup */
-
-__flatten
-static struct bkey_packed *bset_search_write_set(const struct btree *b,
-				struct bset_tree *t,
-				struct bpos *search)
-{
-	unsigned l = 0, r = t->size;
-
-	while (l + 1 != r) {
-		unsigned m = (l + r) >> 1;
-
-		if (bpos_lt(rw_aux_tree(b, t)[m].k, *search))
-			l = m;
-		else
-			r = m;
-	}
-
-	return rw_aux_to_bkey(b, t, l);
-}
-
-static inline void prefetch_four_cachelines(void *p)
-{
-#ifdef CONFIG_X86_64
-	asm("prefetcht0 (-127 + 64 * 0)(%0);"
-	    "prefetcht0 (-127 + 64 * 1)(%0);"
-	    "prefetcht0 (-127 + 64 * 2)(%0);"
-	    "prefetcht0 (-127 + 64 * 3)(%0);"
-	    :
-	    : "r" (p + 127));
-#else
-	prefetch(p + L1_CACHE_BYTES * 0);
-	prefetch(p + L1_CACHE_BYTES * 1);
-	prefetch(p + L1_CACHE_BYTES * 2);
-	prefetch(p + L1_CACHE_BYTES * 3);
-#endif
-}
-
-static inline bool bkey_mantissa_bits_dropped(const struct btree *b,
-					      const struct bkey_float *f,
-					      unsigned idx)
-{
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-	unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits;
-
-	return f->exponent > key_bits_start;
-#else
-	unsigned key_bits_end = high_bit_offset + b->nr_key_bits;
-
-	return f->exponent + BKEY_MANTISSA_BITS < key_bits_end;
-#endif
-}
-
-__flatten
-static struct bkey_packed *bset_search_tree(const struct btree *b,
-				const struct bset_tree *t,
-				const struct bpos *search,
-				const struct bkey_packed *packed_search)
-{
-	struct ro_aux_tree *base = ro_aux_tree_base(b, t);
-	struct bkey_float *f;
-	struct bkey_packed *k;
-	unsigned inorder, n = 1, l, r;
-	int cmp;
-
-	do {
-		if (likely(n << 4 < t->size))
-			prefetch(&base->f[n << 4]);
-
-		f = &base->f[n];
-		if (unlikely(f->exponent >= BFLOAT_FAILED))
-			goto slowpath;
-
-		l = f->mantissa;
-		r = bkey_mantissa(packed_search, f, n);
-
-		if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n))
-			goto slowpath;
-
-		n = n * 2 + (l < r);
-		continue;
-slowpath:
-		k = tree_to_bkey(b, t, n);
-		cmp = bkey_cmp_p_or_unp(b, k, packed_search, search);
-		if (!cmp)
-			return k;
-
-		n = n * 2 + (cmp < 0);
-	} while (n < t->size);
-
-	inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra);
-
-	/*
-	 * n would have been the node we recursed to - the low bit tells us if
-	 * we recursed left or recursed right.
-	 */
-	if (likely(!(n & 1))) {
-		--inorder;
-		if (unlikely(!inorder))
-			return btree_bkey_first(b, t);
-
-		f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)];
-	}
-
-	return cacheline_to_bkey(b, t, inorder, f->key_offset);
-}
-
-static __always_inline __flatten
-struct bkey_packed *__bch2_bset_search(struct btree *b,
-				struct bset_tree *t,
-				struct bpos *search,
-				const struct bkey_packed *lossy_packed_search)
-{
-
-	/*
-	 * First, we search for a cacheline, then lastly we do a linear search
-	 * within that cacheline.
-	 *
-	 * To search for the cacheline, there's three different possibilities:
-	 *  * The set is too small to have a search tree, so we just do a linear
-	 *    search over the whole set.
-	 *  * The set is the one we're currently inserting into; keeping a full
-	 *    auxiliary search tree up to date would be too expensive, so we
-	 *    use a much simpler lookup table to do a binary search -
-	 *    bset_search_write_set().
-	 *  * Or we use the auxiliary search tree we constructed earlier -
-	 *    bset_search_tree()
-	 */
-
-	switch (bset_aux_tree_type(t)) {
-	case BSET_NO_AUX_TREE:
-		return btree_bkey_first(b, t);
-	case BSET_RW_AUX_TREE:
-		return bset_search_write_set(b, t, search);
-	case BSET_RO_AUX_TREE:
-		return bset_search_tree(b, t, search, lossy_packed_search);
-	default:
-		BUG();
-	}
-}
-
-static __always_inline __flatten
-struct bkey_packed *bch2_bset_search_linear(struct btree *b,
-				struct bset_tree *t,
-				struct bpos *search,
-				struct bkey_packed *packed_search,
-				const struct bkey_packed *lossy_packed_search,
-				struct bkey_packed *m)
-{
-	if (lossy_packed_search)
-		while (m != btree_bkey_last(b, t) &&
-		       bkey_iter_cmp_p_or_unp(b, m,
-					lossy_packed_search, search) < 0)
-			m = bkey_p_next(m);
-
-	if (!packed_search)
-		while (m != btree_bkey_last(b, t) &&
-		       bkey_iter_pos_cmp(b, m, search) < 0)
-			m = bkey_p_next(m);
-
-	if (bch2_expensive_debug_checks) {
-		struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
-
-		BUG_ON(prev &&
-		       bkey_iter_cmp_p_or_unp(b, prev,
-					packed_search, search) >= 0);
-	}
-
-	return m;
-}
-
-/* Btree node iterator */
-
-static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
-			      struct btree *b,
-			      const struct bkey_packed *k,
-			      const struct bkey_packed *end)
-{
-	if (k != end) {
-		struct btree_node_iter_set *pos;
-
-		btree_node_iter_for_each(iter, pos)
-			;
-
-		BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data));
-		*pos = (struct btree_node_iter_set) {
-			__btree_node_key_to_offset(b, k),
-			__btree_node_key_to_offset(b, end)
-		};
-	}
-}
-
-void bch2_btree_node_iter_push(struct btree_node_iter *iter,
-			       struct btree *b,
-			       const struct bkey_packed *k,
-			       const struct bkey_packed *end)
-{
-	__bch2_btree_node_iter_push(iter, b, k, end);
-	bch2_btree_node_iter_sort(iter, b);
-}
-
-noinline __flatten __cold
-static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
-			      struct btree *b, struct bpos *search)
-{
-	struct bkey_packed *k;
-
-	trace_bkey_pack_pos_fail(search);
-
-	bch2_btree_node_iter_init_from_start(iter, b);
-
-	while ((k = bch2_btree_node_iter_peek(iter, b)) &&
-	       bkey_iter_pos_cmp(b, k, search) < 0)
-		bch2_btree_node_iter_advance(iter, b);
-}
-
-/**
- * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a
- * given position
- *
- * @iter:	iterator to initialize
- * @b:		btree node to search
- * @search:	search key
- *
- * Main entry point to the lookup code for individual btree nodes:
- *
- * NOTE:
- *
- * When you don't filter out deleted keys, btree nodes _do_ contain duplicate
- * keys. This doesn't matter for most code, but it does matter for lookups.
- *
- * Some adjacent keys with a string of equal keys:
- *	i j k k k k l m
- *
- * If you search for k, the lookup code isn't guaranteed to return you any
- * specific k. The lookup code is conceptually doing a binary search and
- * iterating backwards is very expensive so if the pivot happens to land at the
- * last k that's what you'll get.
- *
- * This works out ok, but it's something to be aware of:
- *
- *  - For non extents, we guarantee that the live key comes last - see
- *    btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't
- *    see will only be deleted keys you don't care about.
- *
- *  - For extents, deleted keys sort last (see the comment at the top of this
- *    file). But when you're searching for extents, you actually want the first
- *    key strictly greater than your search key - an extent that compares equal
- *    to the search key is going to have 0 sectors after the search key.
- *
- *    But this does mean that we can't just search for
- *    bpos_successor(start_of_range) to get the first extent that overlaps with
- *    the range we want - if we're unlucky and there's an extent that ends
- *    exactly where we searched, then there could be a deleted key at the same
- *    position and we'd get that when we search instead of the preceding extent
- *    we needed.
- *
- *    So we've got to search for start_of_range, then after the lookup iterate
- *    past any extents that compare equal to the position we searched for.
- */
-__flatten
-void bch2_btree_node_iter_init(struct btree_node_iter *iter,
-			       struct btree *b, struct bpos *search)
-{
-	struct bkey_packed p, *packed_search = NULL;
-	struct btree_node_iter_set *pos = iter->data;
-	struct bkey_packed *k[MAX_BSETS];
-	unsigned i;
-
-	EBUG_ON(bpos_lt(*search, b->data->min_key));
-	EBUG_ON(bpos_gt(*search, b->data->max_key));
-	bset_aux_tree_verify(b);
-
-	memset(iter, 0, sizeof(*iter));
-
-	switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) {
-	case BKEY_PACK_POS_EXACT:
-		packed_search = &p;
-		break;
-	case BKEY_PACK_POS_SMALLER:
-		packed_search = NULL;
-		break;
-	case BKEY_PACK_POS_FAIL:
-		btree_node_iter_init_pack_failed(iter, b, search);
-		return;
-	}
-
-	for (i = 0; i < b->nsets; i++) {
-		k[i] = __bch2_bset_search(b, b->set + i, search, &p);
-		prefetch_four_cachelines(k[i]);
-	}
-
-	for (i = 0; i < b->nsets; i++) {
-		struct bset_tree *t = b->set + i;
-		struct bkey_packed *end = btree_bkey_last(b, t);
-
-		k[i] = bch2_bset_search_linear(b, t, search,
-					       packed_search, &p, k[i]);
-		if (k[i] != end)
-			*pos++ = (struct btree_node_iter_set) {
-				__btree_node_key_to_offset(b, k[i]),
-				__btree_node_key_to_offset(b, end)
-			};
-	}
-
-	bch2_btree_node_iter_sort(iter, b);
-}
-
-void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter,
-					  struct btree *b)
-{
-	memset(iter, 0, sizeof(*iter));
-
-	for_each_bset(b, t)
-		__bch2_btree_node_iter_push(iter, b,
-					   btree_bkey_first(b, t),
-					   btree_bkey_last(b, t));
-	bch2_btree_node_iter_sort(iter, b);
-}
-
-struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter,
-						  struct btree *b,
-						  struct bset_tree *t)
-{
-	struct btree_node_iter_set *set;
-
-	btree_node_iter_for_each(iter, set)
-		if (set->end == t->end_offset)
-			return __btree_node_offset_to_key(b, set->k);
-
-	return btree_bkey_last(b, t);
-}
-
-static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter,
-					    struct btree *b,
-					    unsigned first)
-{
-	bool ret;
-
-	if ((ret = (btree_node_iter_cmp(b,
-					iter->data[first],
-					iter->data[first + 1]) > 0)))
-		swap(iter->data[first], iter->data[first + 1]);
-	return ret;
-}
-
-void bch2_btree_node_iter_sort(struct btree_node_iter *iter,
-			       struct btree *b)
-{
-	/* unrolled bubble sort: */
-
-	if (!__btree_node_iter_set_end(iter, 2)) {
-		btree_node_iter_sort_two(iter, b, 0);
-		btree_node_iter_sort_two(iter, b, 1);
-	}
-
-	if (!__btree_node_iter_set_end(iter, 1))
-		btree_node_iter_sort_two(iter, b, 0);
-}
-
-void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter,
-				   struct btree_node_iter_set *set)
-{
-	struct btree_node_iter_set *last =
-		iter->data + ARRAY_SIZE(iter->data) - 1;
-
-	memmove(&set[0], &set[1], (void *) last - (void *) set);
-	*last = (struct btree_node_iter_set) { 0, 0 };
-}
-
-static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
-						  struct btree *b)
-{
-	iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s;
-
-	EBUG_ON(iter->data->k > iter->data->end);
-
-	if (unlikely(__btree_node_iter_set_end(iter, 0))) {
-		/* avoid an expensive memmove call: */
-		iter->data[0] = iter->data[1];
-		iter->data[1] = iter->data[2];
-		iter->data[2] = (struct btree_node_iter_set) { 0, 0 };
-		return;
-	}
-
-	if (__btree_node_iter_set_end(iter, 1))
-		return;
-
-	if (!btree_node_iter_sort_two(iter, b, 0))
-		return;
-
-	if (__btree_node_iter_set_end(iter, 2))
-		return;
-
-	btree_node_iter_sort_two(iter, b, 1);
-}
-
-void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
-				  struct btree *b)
-{
-	if (bch2_expensive_debug_checks) {
-		bch2_btree_node_iter_verify(iter, b);
-		bch2_btree_node_iter_next_check(iter, b);
-	}
-
-	__bch2_btree_node_iter_advance(iter, b);
-}
-
-/*
- * Expensive:
- */
-struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
-						  struct btree *b)
-{
-	struct bkey_packed *k, *prev = NULL;
-	struct btree_node_iter_set *set;
-	unsigned end = 0;
-
-	if (bch2_expensive_debug_checks)
-		bch2_btree_node_iter_verify(iter, b);
-
-	for_each_bset(b, t) {
-		k = bch2_bkey_prev_all(b, t,
-			bch2_btree_node_iter_bset_pos(iter, b, t));
-		if (k &&
-		    (!prev || bkey_iter_cmp(b, k, prev) > 0)) {
-			prev = k;
-			end = t->end_offset;
-		}
-	}
-
-	if (!prev)
-		return NULL;
-
-	/*
-	 * We're manually memmoving instead of just calling sort() to ensure the
-	 * prev we picked ends up in slot 0 - sort won't necessarily put it
-	 * there because of duplicate deleted keys:
-	 */
-	btree_node_iter_for_each(iter, set)
-		if (set->end == end)
-			goto found;
-
-	BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]);
-found:
-	BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data));
-
-	memmove(&iter->data[1],
-		&iter->data[0],
-		(void *) set - (void *) &iter->data[0]);
-
-	iter->data[0].k = __btree_node_key_to_offset(b, prev);
-	iter->data[0].end = end;
-
-	if (bch2_expensive_debug_checks)
-		bch2_btree_node_iter_verify(iter, b);
-	return prev;
-}
-
-struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter,
-					      struct btree *b)
-{
-	struct bkey_packed *prev;
-
-	do {
-		prev = bch2_btree_node_iter_prev_all(iter, b);
-	} while (prev && bkey_deleted(prev));
-
-	return prev;
-}
-
-struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
-						 struct btree *b,
-						 struct bkey *u)
-{
-	struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b);
-
-	return k ? bkey_disassemble(b, k, u) : bkey_s_c_null;
-}
-
-/* Mergesort */
-
-void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats)
-{
-	for_each_bset_c(b, t) {
-		enum bset_aux_tree_type type = bset_aux_tree_type(t);
-		size_t j;
-
-		stats->sets[type].nr++;
-		stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) *
-			sizeof(u64);
-
-		if (bset_has_ro_aux_tree(t)) {
-			stats->floats += t->size - 1;
-
-			for (j = 1; j < t->size; j++)
-				stats->failed +=
-					bkey_float(b, t, j)->exponent ==
-					BFLOAT_FAILED;
-		}
-	}
-}
-
-void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
-			 struct bkey_packed *k)
-{
-	struct bset_tree *t = bch2_bkey_to_bset(b, k);
-	struct bkey uk;
-	unsigned j, inorder;
-
-	if (!bset_has_ro_aux_tree(t))
-		return;
-
-	inorder = bkey_to_cacheline(b, t, k);
-	if (!inorder || inorder >= t->size)
-		return;
-
-	j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra);
-	if (k != tree_to_bkey(b, t, j))
-		return;
-
-	switch (bkey_float(b, t, j)->exponent) {
-	case BFLOAT_FAILED:
-		uk = bkey_unpack_key(b, k);
-		prt_printf(out,
-		       "    failed unpacked at depth %u\n"
-		       "\t",
-		       ilog2(j));
-		bch2_bpos_to_text(out, uk.p);
-		prt_printf(out, "\n");
-		break;
-	}
-}
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
deleted file mode 100644
index 5c6c7a14fa0f..000000000000
--- a/fs/bcachefs/bset.h
+++ /dev/null
@@ -1,544 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BSET_H
-#define _BCACHEFS_BSET_H
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-
-#include "bcachefs.h"
-#include "bkey.h"
-#include "bkey_methods.h"
-#include "btree_types.h"
-#include "util.h" /* for time_stats */
-#include "vstructs.h"
-
-/*
- * BKEYS:
- *
- * A bkey contains a key, a size field, a variable number of pointers, and some
- * ancillary flag bits.
- *
- * We use two different functions for validating bkeys, bkey_invalid and
- * bkey_deleted().
- *
- * The one exception to the rule that ptr_invalid() filters out invalid keys is
- * that it also filters out keys of size 0 - these are keys that have been
- * completely overwritten. It'd be safe to delete these in memory while leaving
- * them on disk, just unnecessary work - so we filter them out when resorting
- * instead.
- *
- * We can't filter out stale keys when we're resorting, because garbage
- * collection needs to find them to ensure bucket gens don't wrap around -
- * unless we're rewriting the btree node those stale keys still exist on disk.
- *
- * We also implement functions here for removing some number of sectors from the
- * front or the back of a bkey - this is mainly used for fixing overlapping
- * extents, by removing the overlapping sectors from the older key.
- *
- * BSETS:
- *
- * A bset is an array of bkeys laid out contiguously in memory in sorted order,
- * along with a header. A btree node is made up of a number of these, written at
- * different times.
- *
- * There could be many of them on disk, but we never allow there to be more than
- * 4 in memory - we lazily resort as needed.
- *
- * We implement code here for creating and maintaining auxiliary search trees
- * (described below) for searching an individial bset, and on top of that we
- * implement a btree iterator.
- *
- * BTREE ITERATOR:
- *
- * Most of the code in bcache doesn't care about an individual bset - it needs
- * to search entire btree nodes and iterate over them in sorted order.
- *
- * The btree iterator code serves both functions; it iterates through the keys
- * in a btree node in sorted order, starting from either keys after a specific
- * point (if you pass it a search key) or the start of the btree node.
- *
- * AUXILIARY SEARCH TREES:
- *
- * Since keys are variable length, we can't use a binary search on a bset - we
- * wouldn't be able to find the start of the next key. But binary searches are
- * slow anyways, due to terrible cache behaviour; bcache originally used binary
- * searches and that code topped out at under 50k lookups/second.
- *
- * So we need to construct some sort of lookup table. Since we only insert keys
- * into the last (unwritten) set, most of the keys within a given btree node are
- * usually in sets that are mostly constant. We use two different types of
- * lookup tables to take advantage of this.
- *
- * Both lookup tables share in common that they don't index every key in the
- * set; they index one key every BSET_CACHELINE bytes, and then a linear search
- * is used for the rest.
- *
- * For sets that have been written to disk and are no longer being inserted
- * into, we construct a binary search tree in an array - traversing a binary
- * search tree in an array gives excellent locality of reference and is very
- * fast, since both children of any node are adjacent to each other in memory
- * (and their grandchildren, and great grandchildren...) - this means
- * prefetching can be used to great effect.
- *
- * It's quite useful performance wise to keep these nodes small - not just
- * because they're more likely to be in L2, but also because we can prefetch
- * more nodes on a single cacheline and thus prefetch more iterations in advance
- * when traversing this tree.
- *
- * Nodes in the auxiliary search tree must contain both a key to compare against
- * (we don't want to fetch the key from the set, that would defeat the purpose),
- * and a pointer to the key. We use a few tricks to compress both of these.
- *
- * To compress the pointer, we take advantage of the fact that one node in the
- * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
- * a function (to_inorder()) that takes the index of a node in a binary tree and
- * returns what its index would be in an inorder traversal, so we only have to
- * store the low bits of the offset.
- *
- * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
- * compress that,  we take advantage of the fact that when we're traversing the
- * search tree at every iteration we know that both our search key and the key
- * we're looking for lie within some range - bounded by our previous
- * comparisons. (We special case the start of a search so that this is true even
- * at the root of the tree).
- *
- * So we know the key we're looking for is between a and b, and a and b don't
- * differ higher than bit 50, we don't need to check anything higher than bit
- * 50.
- *
- * We don't usually need the rest of the bits, either; we only need enough bits
- * to partition the key range we're currently checking.  Consider key n - the
- * key our auxiliary search tree node corresponds to, and key p, the key
- * immediately preceding n.  The lowest bit we need to store in the auxiliary
- * search tree is the highest bit that differs between n and p.
- *
- * Note that this could be bit 0 - we might sometimes need all 80 bits to do the
- * comparison. But we'd really like our nodes in the auxiliary search tree to be
- * of fixed size.
- *
- * The solution is to make them fixed size, and when we're constructing a node
- * check if p and n differed in the bits we needed them to. If they don't we
- * flag that node, and when doing lookups we fallback to comparing against the
- * real key. As long as this doesn't happen to often (and it seems to reliably
- * happen a bit less than 1% of the time), we win - even on failures, that key
- * is then more likely to be in cache than if we were doing binary searches all
- * the way, since we're touching so much less memory.
- *
- * The keys in the auxiliary search tree are stored in (software) floating
- * point, with an exponent and a mantissa. The exponent needs to be big enough
- * to address all the bits in the original key, but the number of bits in the
- * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
- *
- * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
- * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
- * We need one node per 128 bytes in the btree node, which means the auxiliary
- * search trees take up 3% as much memory as the btree itself.
- *
- * Constructing these auxiliary search trees is moderately expensive, and we
- * don't want to be constantly rebuilding the search tree for the last set
- * whenever we insert another key into it. For the unwritten set, we use a much
- * simpler lookup table - it's just a flat array, so index i in the lookup table
- * corresponds to the i range of BSET_CACHELINE bytes in the set. Indexing
- * within each byte range works the same as with the auxiliary search trees.
- *
- * These are much easier to keep up to date when we insert a key - we do it
- * somewhat lazily; when we shift a key up we usually just increment the pointer
- * to it, only when it would overflow do we go to the trouble of finding the
- * first key in that range of bytes again.
- */
-
-enum bset_aux_tree_type {
-	BSET_NO_AUX_TREE,
-	BSET_RO_AUX_TREE,
-	BSET_RW_AUX_TREE,
-};
-
-#define BSET_TREE_NR_TYPES	3
-
-#define BSET_NO_AUX_TREE_VAL	(U16_MAX)
-#define BSET_RW_AUX_TREE_VAL	(U16_MAX - 1)
-
-static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t)
-{
-	switch (t->extra) {
-	case BSET_NO_AUX_TREE_VAL:
-		EBUG_ON(t->size);
-		return BSET_NO_AUX_TREE;
-	case BSET_RW_AUX_TREE_VAL:
-		EBUG_ON(!t->size);
-		return BSET_RW_AUX_TREE;
-	default:
-		EBUG_ON(!t->size);
-		return BSET_RO_AUX_TREE;
-	}
-}
-
-/*
- * BSET_CACHELINE was originally intended to match the hardware cacheline size -
- * it used to be 64, but I realized the lookup code would touch slightly less
- * memory if it was 128.
- *
- * It definites the number of bytes (in struct bset) per struct bkey_float in
- * the auxiliar search tree - when we're done searching the bset_float tree we
- * have this many bytes left that we do a linear search over.
- *
- * Since (after level 5) every level of the bset_tree is on a new cacheline,
- * we're touching one fewer cacheline in the bset tree in exchange for one more
- * cacheline in the linear search - but the linear search might stop before it
- * gets to the second cacheline.
- */
-
-#define BSET_CACHELINE		256
-
-static inline size_t btree_keys_cachelines(const struct btree *b)
-{
-	return (1U << b->byte_order) / BSET_CACHELINE;
-}
-
-static inline size_t btree_aux_data_bytes(const struct btree *b)
-{
-	return btree_keys_cachelines(b) * 8;
-}
-
-static inline size_t btree_aux_data_u64s(const struct btree *b)
-{
-	return btree_aux_data_bytes(b) / sizeof(u64);
-}
-
-#define for_each_bset(_b, _t)						\
-	for (struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
-
-#define for_each_bset_c(_b, _t)						\
-	for (const struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
-
-#define bset_tree_for_each_key(_b, _t, _k)				\
-	for (_k = btree_bkey_first(_b, _t);				\
-	     _k != btree_bkey_last(_b, _t);				\
-	     _k = bkey_p_next(_k))
-
-static inline bool bset_has_ro_aux_tree(const struct bset_tree *t)
-{
-	return bset_aux_tree_type(t) == BSET_RO_AUX_TREE;
-}
-
-static inline bool bset_has_rw_aux_tree(struct bset_tree *t)
-{
-	return bset_aux_tree_type(t) == BSET_RW_AUX_TREE;
-}
-
-static inline void bch2_bset_set_no_aux_tree(struct btree *b,
-					    struct bset_tree *t)
-{
-	BUG_ON(t < b->set);
-
-	for (; t < b->set + ARRAY_SIZE(b->set); t++) {
-		t->size = 0;
-		t->extra = BSET_NO_AUX_TREE_VAL;
-		t->aux_data_offset = U16_MAX;
-	}
-}
-
-static inline void btree_node_set_format(struct btree *b,
-					 struct bkey_format f)
-{
-	int len;
-
-	b->format	= f;
-	b->nr_key_bits	= bkey_format_key_bits(&f);
-
-	len = bch2_compile_bkey_format(&b->format, b->aux_data);
-	BUG_ON(len < 0 || len > U8_MAX);
-
-	b->unpack_fn_len = len;
-
-	bch2_bset_set_no_aux_tree(b, b->set);
-}
-
-static inline struct bset *bset_next_set(struct btree *b,
-					 unsigned block_bytes)
-{
-	struct bset *i = btree_bset_last(b);
-
-	EBUG_ON(!is_power_of_2(block_bytes));
-
-	return ((void *) i) + round_up(vstruct_bytes(i), block_bytes);
-}
-
-void bch2_btree_keys_init(struct btree *);
-
-void bch2_bset_init_first(struct btree *, struct bset *);
-void bch2_bset_init_next(struct btree *, struct btree_node_entry *);
-void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
-
-void bch2_bset_insert(struct btree *, struct btree_node_iter *,
-		     struct bkey_packed *, struct bkey_i *, unsigned);
-void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned);
-
-/* Bkey utility code */
-
-/* packed or unpacked */
-static inline int bkey_cmp_p_or_unp(const struct btree *b,
-				    const struct bkey_packed *l,
-				    const struct bkey_packed *r_packed,
-				    const struct bpos *r)
-{
-	EBUG_ON(r_packed && !bkey_packed(r_packed));
-
-	if (unlikely(!bkey_packed(l)))
-		return bpos_cmp(packed_to_bkey_c(l)->p, *r);
-
-	if (likely(r_packed))
-		return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b);
-
-	return __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
-}
-
-static inline struct bset_tree *
-bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k)
-{
-	unsigned offset = __btree_node_key_to_offset(b, k);
-
-	for_each_bset(b, t)
-		if (offset <= t->end_offset) {
-			EBUG_ON(offset < btree_bkey_first_offset(t));
-			return t;
-		}
-
-	BUG();
-}
-
-struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *);
-
-struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *,
-					  struct bkey_packed *, unsigned);
-
-static inline struct bkey_packed *
-bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
-{
-	return bch2_bkey_prev_filter(b, t, k, 0);
-}
-
-static inline struct bkey_packed *
-bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
-{
-	return bch2_bkey_prev_filter(b, t, k, 1);
-}
-
-/* Btree key iteration */
-
-void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *,
-			      const struct bkey_packed *,
-			      const struct bkey_packed *);
-void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *,
-			       struct bpos *);
-void bch2_btree_node_iter_init_from_start(struct btree_node_iter *,
-					  struct btree *);
-struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *,
-						 struct btree *,
-						 struct bset_tree *);
-
-void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *);
-void bch2_btree_node_iter_set_drop(struct btree_node_iter *,
-				   struct btree_node_iter_set *);
-void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *);
-
-#define btree_node_iter_for_each(_iter, _set)				\
-	for (_set = (_iter)->data;					\
-	     _set < (_iter)->data + ARRAY_SIZE((_iter)->data) &&	\
-	     (_set)->k != (_set)->end;					\
-	     _set++)
-
-static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter,
-					     unsigned i)
-{
-	return iter->data[i].k == iter->data[i].end;
-}
-
-static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter)
-{
-	return __btree_node_iter_set_end(iter, 0);
-}
-
-/*
- * When keys compare equal, deleted keys compare first:
- *
- * XXX: only need to compare pointers for keys that are both within a
- * btree_node_iterator - we need to break ties for prev() to work correctly
- */
-static inline int bkey_iter_cmp(const struct btree *b,
-				const struct bkey_packed *l,
-				const struct bkey_packed *r)
-{
-	return bch2_bkey_cmp_packed(b, l, r)
-		?: (int) bkey_deleted(r) - (int) bkey_deleted(l)
-		?: cmp_int(l, r);
-}
-
-static inline int btree_node_iter_cmp(const struct btree *b,
-				      struct btree_node_iter_set l,
-				      struct btree_node_iter_set r)
-{
-	return bkey_iter_cmp(b,
-			__btree_node_offset_to_key(b, l.k),
-			__btree_node_offset_to_key(b, r.k));
-}
-
-/* These assume r (the search key) is not a deleted key: */
-static inline int bkey_iter_pos_cmp(const struct btree *b,
-			const struct bkey_packed *l,
-			const struct bpos *r)
-{
-	return bkey_cmp_left_packed(b, l, r)
-		?: -((int) bkey_deleted(l));
-}
-
-static inline int bkey_iter_cmp_p_or_unp(const struct btree *b,
-				    const struct bkey_packed *l,
-				    const struct bkey_packed *r_packed,
-				    const struct bpos *r)
-{
-	return bkey_cmp_p_or_unp(b, l, r_packed, r)
-		?: -((int) bkey_deleted(l));
-}
-
-static inline struct bkey_packed *
-__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
-				struct btree *b)
-{
-	return __btree_node_offset_to_key(b, iter->data->k);
-}
-
-static inline struct bkey_packed *
-bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, struct btree *b)
-{
-	return !bch2_btree_node_iter_end(iter)
-		? __btree_node_offset_to_key(b, iter->data->k)
-		: NULL;
-}
-
-static inline struct bkey_packed *
-bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b)
-{
-	struct bkey_packed *k;
-
-	while ((k = bch2_btree_node_iter_peek_all(iter, b)) &&
-	       bkey_deleted(k))
-		bch2_btree_node_iter_advance(iter, b);
-
-	return k;
-}
-
-static inline struct bkey_packed *
-bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b)
-{
-	struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b);
-
-	if (ret)
-		bch2_btree_node_iter_advance(iter, b);
-
-	return ret;
-}
-
-struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *,
-						  struct btree *);
-struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *,
-					      struct btree *);
-
-struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
-						struct btree *,
-						struct bkey *);
-
-#define for_each_btree_node_key(b, k, iter)				\
-	for (bch2_btree_node_iter_init_from_start((iter), (b));		\
-	     (k = bch2_btree_node_iter_peek((iter), (b)));		\
-	     bch2_btree_node_iter_advance(iter, b))
-
-#define for_each_btree_node_key_unpack(b, k, iter, unpacked)		\
-	for (bch2_btree_node_iter_init_from_start((iter), (b));		\
-	     (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\
-	     bch2_btree_node_iter_advance(iter, b))
-
-/* Accounting: */
-
-struct btree_nr_keys bch2_btree_node_count_keys(struct btree *);
-
-static inline void btree_keys_account_key(struct btree_nr_keys *n,
-					  unsigned bset,
-					  struct bkey_packed *k,
-					  int sign)
-{
-	n->live_u64s		+= k->u64s * sign;
-	n->bset_u64s[bset]	+= k->u64s * sign;
-
-	if (bkey_packed(k))
-		n->packed_keys	+= sign;
-	else
-		n->unpacked_keys += sign;
-}
-
-static inline void btree_keys_account_val_delta(struct btree *b,
-						struct bkey_packed *k,
-						int delta)
-{
-	struct bset_tree *t = bch2_bkey_to_bset(b, k);
-
-	b->nr.live_u64s			+= delta;
-	b->nr.bset_u64s[t - b->set]	+= delta;
-}
-
-#define btree_keys_account_key_add(_nr, _bset_idx, _k)		\
-	btree_keys_account_key(_nr, _bset_idx, _k, 1)
-#define btree_keys_account_key_drop(_nr, _bset_idx, _k)	\
-	btree_keys_account_key(_nr, _bset_idx, _k, -1)
-
-#define btree_account_key_add(_b, _k)				\
-	btree_keys_account_key(&(_b)->nr,			\
-		bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1)
-#define btree_account_key_drop(_b, _k)				\
-	btree_keys_account_key(&(_b)->nr,			\
-		bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1)
-
-struct bset_stats {
-	struct {
-		size_t nr, bytes;
-	} sets[BSET_TREE_NR_TYPES];
-
-	size_t floats;
-	size_t failed;
-};
-
-void bch2_btree_keys_stats(const struct btree *, struct bset_stats *);
-void bch2_bfloat_to_text(struct printbuf *, struct btree *,
-			 struct bkey_packed *);
-
-/* Debug stuff */
-
-void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned);
-void bch2_dump_btree_node(struct bch_fs *, struct btree *);
-void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *);
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-void __bch2_verify_btree_nr_keys(struct btree *);
-void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
-void bch2_verify_insert_pos(struct btree *, struct bkey_packed *,
-			    struct bkey_packed *, unsigned);
-
-#else
-
-static inline void __bch2_verify_btree_nr_keys(struct btree *b) {}
-static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
-					      struct btree *b) {}
-static inline void bch2_verify_insert_pos(struct btree *b,
-					  struct bkey_packed *where,
-					  struct bkey_packed *insert,
-					  unsigned clobber_u64s) {}
-#endif
-
-static inline void bch2_verify_btree_nr_keys(struct btree *b)
-{
-	if (bch2_debug_check_btree_accounting)
-		__bch2_verify_btree_nr_keys(b);
-}
-
-#endif /* _BCACHEFS_BSET_H */
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
deleted file mode 100644
index 9e4ed75d3675..000000000000
--- a/fs/bcachefs/btree_cache.c
+++ /dev/null
@@ -1,1345 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bbpos.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_locking.h"
-#include "debug.h"
-#include "errcode.h"
-#include "error.h"
-#include "journal.h"
-#include "trace.h"
-
-#include <linux/prefetch.h>
-#include <linux/sched/mm.h>
-
-#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
-do {						 \
-	if (shrinker_counter)			 \
-		bc->not_freed_##counter++;	 \
-} while (0)
-
-const char * const bch2_btree_node_flags[] = {
-#define x(f)	#f,
-	BTREE_FLAGS()
-#undef x
-	NULL
-};
-
-void bch2_recalc_btree_reserve(struct bch_fs *c)
-{
-	unsigned i, reserve = 16;
-
-	if (!c->btree_roots_known[0].b)
-		reserve += 8;
-
-	for (i = 0; i < btree_id_nr_alive(c); i++) {
-		struct btree_root *r = bch2_btree_id_root(c, i);
-
-		if (r->b)
-			reserve += min_t(unsigned, 1, r->b->c.level) * 8;
-	}
-
-	c->btree_cache.reserve = reserve;
-}
-
-static inline unsigned btree_cache_can_free(struct btree_cache *bc)
-{
-	return max_t(int, 0, bc->used - bc->reserve);
-}
-
-static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
-{
-	if (b->c.lock.readers)
-		list_move(&b->list, &bc->freed_pcpu);
-	else
-		list_move(&b->list, &bc->freed_nonpcpu);
-}
-
-static void btree_node_data_free(struct bch_fs *c, struct btree *b)
-{
-	struct btree_cache *bc = &c->btree_cache;
-
-	EBUG_ON(btree_node_write_in_flight(b));
-
-	clear_btree_node_just_written(b);
-
-	kvfree(b->data);
-	b->data = NULL;
-#ifdef __KERNEL__
-	kvfree(b->aux_data);
-#else
-	munmap(b->aux_data, btree_aux_data_bytes(b));
-#endif
-	b->aux_data = NULL;
-
-	bc->used--;
-
-	btree_node_to_freedlist(bc, b);
-}
-
-static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
-				   const void *obj)
-{
-	const struct btree *b = obj;
-	const u64 *v = arg->key;
-
-	return b->hash_val == *v ? 0 : 1;
-}
-
-static const struct rhashtable_params bch_btree_cache_params = {
-	.head_offset	= offsetof(struct btree, hash),
-	.key_offset	= offsetof(struct btree, hash_val),
-	.key_len	= sizeof(u64),
-	.obj_cmpfn	= bch2_btree_cache_cmp_fn,
-};
-
-static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
-{
-	BUG_ON(b->data || b->aux_data);
-
-	b->data = kvmalloc(btree_buf_bytes(b), gfp);
-	if (!b->data)
-		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
-#ifdef __KERNEL__
-	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
-#else
-	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
-			   PROT_READ|PROT_WRITE|PROT_EXEC,
-			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
-	if (b->aux_data == MAP_FAILED)
-		b->aux_data = NULL;
-#endif
-	if (!b->aux_data) {
-		kvfree(b->data);
-		b->data = NULL;
-		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
-	}
-
-	return 0;
-}
-
-static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
-{
-	struct btree *b;
-
-	b = kzalloc(sizeof(struct btree), gfp);
-	if (!b)
-		return NULL;
-
-	bkey_btree_ptr_init(&b->key);
-	INIT_LIST_HEAD(&b->list);
-	INIT_LIST_HEAD(&b->write_blocked);
-	b->byte_order = ilog2(c->opts.btree_node_size);
-	return b;
-}
-
-struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
-{
-	struct btree_cache *bc = &c->btree_cache;
-	struct btree *b;
-
-	b = __btree_node_mem_alloc(c, GFP_KERNEL);
-	if (!b)
-		return NULL;
-
-	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
-		kfree(b);
-		return NULL;
-	}
-
-	bch2_btree_lock_init(&b->c, 0);
-
-	bc->used++;
-	list_add(&b->list, &bc->freeable);
-	return b;
-}
-
-/* Btree in memory cache - hash table */
-
-void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
-{
-	int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
-
-	BUG_ON(ret);
-
-	/* Cause future lookups for this node to fail: */
-	b->hash_val = 0;
-
-	if (b->c.btree_id < BTREE_ID_NR)
-		--bc->used_by_btree[b->c.btree_id];
-}
-
-int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
-{
-	BUG_ON(b->hash_val);
-	b->hash_val = btree_ptr_hash_val(&b->key);
-
-	int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
-						bch_btree_cache_params);
-	if (!ret && b->c.btree_id < BTREE_ID_NR)
-		bc->used_by_btree[b->c.btree_id]++;
-	return ret;
-}
-
-int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
-				unsigned level, enum btree_id id)
-{
-	int ret;
-
-	b->c.level	= level;
-	b->c.btree_id	= id;
-
-	mutex_lock(&bc->lock);
-	ret = __bch2_btree_node_hash_insert(bc, b);
-	if (!ret)
-		list_add_tail(&b->list, &bc->live);
-	mutex_unlock(&bc->lock);
-
-	return ret;
-}
-
-void bch2_btree_node_update_key_early(struct btree_trans *trans,
-				      enum btree_id btree, unsigned level,
-				      struct bkey_s_c old, struct bkey_i *new)
-{
-	struct bch_fs *c = trans->c;
-	struct btree *b;
-	struct bkey_buf tmp;
-	int ret;
-
-	bch2_bkey_buf_init(&tmp);
-	bch2_bkey_buf_reassemble(&tmp, c, old);
-
-	b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
-	if (!IS_ERR_OR_NULL(b)) {
-		mutex_lock(&c->btree_cache.lock);
-
-		bch2_btree_node_hash_remove(&c->btree_cache, b);
-
-		bkey_copy(&b->key, new);
-		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
-		BUG_ON(ret);
-
-		mutex_unlock(&c->btree_cache.lock);
-		six_unlock_read(&b->c.lock);
-	}
-
-	bch2_bkey_buf_exit(&tmp, c);
-}
-
-__flatten
-static inline struct btree *btree_cache_find(struct btree_cache *bc,
-				     const struct bkey_i *k)
-{
-	u64 v = btree_ptr_hash_val(k);
-
-	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
-}
-
-/*
- * this version is for btree nodes that have already been freed (we're not
- * reaping a real btree node)
- */
-static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
-{
-	struct btree_cache *bc = &c->btree_cache;
-	int ret = 0;
-
-	lockdep_assert_held(&bc->lock);
-
-	struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
-
-	u64 mask = b->c.level
-		? bc->pinned_nodes_interior_mask
-		: bc->pinned_nodes_leaf_mask;
-
-	if ((mask & BIT_ULL(b->c.btree_id)) &&
-	    bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
-	    bbpos_cmp(bc->pinned_nodes_end, pos) >= 0)
-		return -BCH_ERR_ENOMEM_btree_node_reclaim;
-
-wait_on_io:
-	if (b->flags & ((1U << BTREE_NODE_dirty)|
-			(1U << BTREE_NODE_read_in_flight)|
-			(1U << BTREE_NODE_write_in_flight))) {
-		if (!flush) {
-			if (btree_node_dirty(b))
-				BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
-			else if (btree_node_read_in_flight(b))
-				BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
-			else if (btree_node_write_in_flight(b))
-				BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
-			return -BCH_ERR_ENOMEM_btree_node_reclaim;
-		}
-
-		/* XXX: waiting on IO with btree cache lock held */
-		bch2_btree_node_wait_on_read(b);
-		bch2_btree_node_wait_on_write(b);
-	}
-
-	if (!six_trylock_intent(&b->c.lock)) {
-		BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
-		return -BCH_ERR_ENOMEM_btree_node_reclaim;
-	}
-
-	if (!six_trylock_write(&b->c.lock)) {
-		BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
-		goto out_unlock_intent;
-	}
-
-	/* recheck under lock */
-	if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
-			(1U << BTREE_NODE_write_in_flight))) {
-		if (!flush) {
-			if (btree_node_read_in_flight(b))
-				BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
-			else if (btree_node_write_in_flight(b))
-				BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
-			goto out_unlock;
-		}
-		six_unlock_write(&b->c.lock);
-		six_unlock_intent(&b->c.lock);
-		goto wait_on_io;
-	}
-
-	if (btree_node_noevict(b)) {
-		BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
-		goto out_unlock;
-	}
-	if (btree_node_write_blocked(b)) {
-		BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
-		goto out_unlock;
-	}
-	if (btree_node_will_make_reachable(b)) {
-		BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
-		goto out_unlock;
-	}
-
-	if (btree_node_dirty(b)) {
-		if (!flush) {
-			BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
-			goto out_unlock;
-		}
-		/*
-		 * Using the underscore version because we don't want to compact
-		 * bsets after the write, since this node is about to be evicted
-		 * - unless btree verify mode is enabled, since it runs out of
-		 * the post write cleanup:
-		 */
-		if (bch2_verify_btree_ondisk)
-			bch2_btree_node_write(c, b, SIX_LOCK_intent,
-					      BTREE_WRITE_cache_reclaim);
-		else
-			__bch2_btree_node_write(c, b,
-						BTREE_WRITE_cache_reclaim);
-
-		six_unlock_write(&b->c.lock);
-		six_unlock_intent(&b->c.lock);
-		goto wait_on_io;
-	}
-out:
-	if (b->hash_val && !ret)
-		trace_and_count(c, btree_cache_reap, c, b);
-	return ret;
-out_unlock:
-	six_unlock_write(&b->c.lock);
-out_unlock_intent:
-	six_unlock_intent(&b->c.lock);
-	ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
-	goto out;
-}
-
-static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
-{
-	return __btree_node_reclaim(c, b, false, shrinker_counter);
-}
-
-static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
-{
-	return __btree_node_reclaim(c, b, true, false);
-}
-
-static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
-					   struct shrink_control *sc)
-{
-	struct bch_fs *c = shrink->private_data;
-	struct btree_cache *bc = &c->btree_cache;
-	struct btree *b, *t;
-	unsigned long nr = sc->nr_to_scan;
-	unsigned long can_free = 0;
-	unsigned long freed = 0;
-	unsigned long touched = 0;
-	unsigned i, flags;
-	unsigned long ret = SHRINK_STOP;
-	bool trigger_writes = atomic_read(&bc->dirty) + nr >=
-		bc->used * 3 / 4;
-
-	if (bch2_btree_shrinker_disabled)
-		return SHRINK_STOP;
-
-	mutex_lock(&bc->lock);
-	flags = memalloc_nofs_save();
-
-	/*
-	 * It's _really_ critical that we don't free too many btree nodes - we
-	 * have to always leave ourselves a reserve. The reserve is how we
-	 * guarantee that allocating memory for a new btree node can always
-	 * succeed, so that inserting keys into the btree can always succeed and
-	 * IO can always make forward progress:
-	 */
-	can_free = btree_cache_can_free(bc);
-	nr = min_t(unsigned long, nr, can_free);
-
-	i = 0;
-	list_for_each_entry_safe(b, t, &bc->freeable, list) {
-		/*
-		 * Leave a few nodes on the freeable list, so that a btree split
-		 * won't have to hit the system allocator:
-		 */
-		if (++i <= 3)
-			continue;
-
-		touched++;
-
-		if (touched >= nr)
-			goto out;
-
-		if (!btree_node_reclaim(c, b, true)) {
-			btree_node_data_free(c, b);
-			six_unlock_write(&b->c.lock);
-			six_unlock_intent(&b->c.lock);
-			freed++;
-			bc->freed++;
-		}
-	}
-restart:
-	list_for_each_entry_safe(b, t, &bc->live, list) {
-		touched++;
-
-		if (btree_node_accessed(b)) {
-			clear_btree_node_accessed(b);
-			bc->not_freed_access_bit++;
-		} else if (!btree_node_reclaim(c, b, true)) {
-			freed++;
-			btree_node_data_free(c, b);
-			bc->freed++;
-
-			bch2_btree_node_hash_remove(bc, b);
-			six_unlock_write(&b->c.lock);
-			six_unlock_intent(&b->c.lock);
-
-			if (freed == nr)
-				goto out_rotate;
-		} else if (trigger_writes &&
-			   btree_node_dirty(b) &&
-			   !btree_node_will_make_reachable(b) &&
-			   !btree_node_write_blocked(b) &&
-			   six_trylock_read(&b->c.lock)) {
-			list_move(&bc->live, &b->list);
-			mutex_unlock(&bc->lock);
-			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
-			six_unlock_read(&b->c.lock);
-			if (touched >= nr)
-				goto out_nounlock;
-			mutex_lock(&bc->lock);
-			goto restart;
-		}
-
-		if (touched >= nr)
-			break;
-	}
-out_rotate:
-	if (&t->list != &bc->live)
-		list_move_tail(&bc->live, &t->list);
-out:
-	mutex_unlock(&bc->lock);
-out_nounlock:
-	ret = freed;
-	memalloc_nofs_restore(flags);
-	trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
-	return ret;
-}
-
-static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
-					    struct shrink_control *sc)
-{
-	struct bch_fs *c = shrink->private_data;
-	struct btree_cache *bc = &c->btree_cache;
-
-	if (bch2_btree_shrinker_disabled)
-		return 0;
-
-	return btree_cache_can_free(bc);
-}
-
-void bch2_fs_btree_cache_exit(struct bch_fs *c)
-{
-	struct btree_cache *bc = &c->btree_cache;
-	struct btree *b;
-	unsigned i, flags;
-
-	shrinker_free(bc->shrink);
-
-	/* vfree() can allocate memory: */
-	flags = memalloc_nofs_save();
-	mutex_lock(&bc->lock);
-
-	if (c->verify_data)
-		list_move(&c->verify_data->list, &bc->live);
-
-	kvfree(c->verify_ondisk);
-
-	for (i = 0; i < btree_id_nr_alive(c); i++) {
-		struct btree_root *r = bch2_btree_id_root(c, i);
-
-		if (r->b)
-			list_add(&r->b->list, &bc->live);
-	}
-
-	list_splice(&bc->freeable, &bc->live);
-
-	while (!list_empty(&bc->live)) {
-		b = list_first_entry(&bc->live, struct btree, list);
-
-		BUG_ON(btree_node_read_in_flight(b) ||
-		       btree_node_write_in_flight(b));
-
-		btree_node_data_free(c, b);
-	}
-
-	BUG_ON(!bch2_journal_error(&c->journal) &&
-	       atomic_read(&c->btree_cache.dirty));
-
-	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
-
-	while (!list_empty(&bc->freed_nonpcpu)) {
-		b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
-		list_del(&b->list);
-		six_lock_exit(&b->c.lock);
-		kfree(b);
-	}
-
-	mutex_unlock(&bc->lock);
-	memalloc_nofs_restore(flags);
-
-	if (bc->table_init_done)
-		rhashtable_destroy(&bc->table);
-}
-
-int bch2_fs_btree_cache_init(struct bch_fs *c)
-{
-	struct btree_cache *bc = &c->btree_cache;
-	struct shrinker *shrink;
-	unsigned i;
-	int ret = 0;
-
-	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
-	if (ret)
-		goto err;
-
-	bc->table_init_done = true;
-
-	bch2_recalc_btree_reserve(c);
-
-	for (i = 0; i < bc->reserve; i++)
-		if (!__bch2_btree_node_mem_alloc(c))
-			goto err;
-
-	list_splice_init(&bc->live, &bc->freeable);
-
-	mutex_init(&c->verify_lock);
-
-	shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
-	if (!shrink)
-		goto err;
-	bc->shrink = shrink;
-	shrink->count_objects	= bch2_btree_cache_count;
-	shrink->scan_objects	= bch2_btree_cache_scan;
-	shrink->seeks		= 4;
-	shrink->private_data	= c;
-	shrinker_register(shrink);
-
-	return 0;
-err:
-	return -BCH_ERR_ENOMEM_fs_btree_cache_init;
-}
-
-void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
-{
-	mutex_init(&bc->lock);
-	INIT_LIST_HEAD(&bc->live);
-	INIT_LIST_HEAD(&bc->freeable);
-	INIT_LIST_HEAD(&bc->freed_pcpu);
-	INIT_LIST_HEAD(&bc->freed_nonpcpu);
-}
-
-/*
- * We can only have one thread cannibalizing other cached btree nodes at a time,
- * or we'll deadlock. We use an open coded mutex to ensure that, which a
- * cannibalize_bucket() will take. This means every time we unlock the root of
- * the btree, we need to release this lock if we have it held.
- */
-void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_cache *bc = &c->btree_cache;
-
-	if (bc->alloc_lock == current) {
-		trace_and_count(c, btree_cache_cannibalize_unlock, trans);
-		bc->alloc_lock = NULL;
-		closure_wake_up(&bc->alloc_wait);
-	}
-}
-
-int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_cache *bc = &c->btree_cache;
-	struct task_struct *old;
-
-	old = cmpxchg(&bc->alloc_lock, NULL, current);
-	if (old == NULL || old == current)
-		goto success;
-
-	if (!cl) {
-		trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
-		return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
-	}
-
-	closure_wait(&bc->alloc_wait, cl);
-
-	/* Try again, after adding ourselves to waitlist */
-	old = cmpxchg(&bc->alloc_lock, NULL, current);
-	if (old == NULL || old == current) {
-		/* We raced */
-		closure_wake_up(&bc->alloc_wait);
-		goto success;
-	}
-
-	trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
-	return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
-
-success:
-	trace_and_count(c, btree_cache_cannibalize_lock, trans);
-	return 0;
-}
-
-static struct btree *btree_node_cannibalize(struct bch_fs *c)
-{
-	struct btree_cache *bc = &c->btree_cache;
-	struct btree *b;
-
-	list_for_each_entry_reverse(b, &bc->live, list)
-		if (!btree_node_reclaim(c, b, false))
-			return b;
-
-	while (1) {
-		list_for_each_entry_reverse(b, &bc->live, list)
-			if (!btree_node_write_and_reclaim(c, b))
-				return b;
-
-		/*
-		 * Rare case: all nodes were intent-locked.
-		 * Just busy-wait.
-		 */
-		WARN_ONCE(1, "btree cache cannibalize failed\n");
-		cond_resched();
-	}
-}
-
-struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_cache *bc = &c->btree_cache;
-	struct list_head *freed = pcpu_read_locks
-		? &bc->freed_pcpu
-		: &bc->freed_nonpcpu;
-	struct btree *b, *b2;
-	u64 start_time = local_clock();
-	unsigned flags;
-
-	flags = memalloc_nofs_save();
-	mutex_lock(&bc->lock);
-
-	/*
-	 * We never free struct btree itself, just the memory that holds the on
-	 * disk node. Check the freed list before allocating a new one:
-	 */
-	list_for_each_entry(b, freed, list)
-		if (!btree_node_reclaim(c, b, false)) {
-			list_del_init(&b->list);
-			goto got_node;
-		}
-
-	b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
-	if (!b) {
-		mutex_unlock(&bc->lock);
-		bch2_trans_unlock(trans);
-		b = __btree_node_mem_alloc(c, GFP_KERNEL);
-		if (!b)
-			goto err;
-		mutex_lock(&bc->lock);
-	}
-
-	bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);
-
-	BUG_ON(!six_trylock_intent(&b->c.lock));
-	BUG_ON(!six_trylock_write(&b->c.lock));
-got_node:
-
-	/*
-	 * btree_free() doesn't free memory; it sticks the node on the end of
-	 * the list. Check if there's any freed nodes there:
-	 */
-	list_for_each_entry(b2, &bc->freeable, list)
-		if (!btree_node_reclaim(c, b2, false)) {
-			swap(b->data, b2->data);
-			swap(b->aux_data, b2->aux_data);
-			btree_node_to_freedlist(bc, b2);
-			six_unlock_write(&b2->c.lock);
-			six_unlock_intent(&b2->c.lock);
-			goto got_mem;
-		}
-
-	mutex_unlock(&bc->lock);
-
-	if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
-		bch2_trans_unlock(trans);
-		if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
-			goto err;
-	}
-
-	mutex_lock(&bc->lock);
-	bc->used++;
-got_mem:
-	mutex_unlock(&bc->lock);
-
-	BUG_ON(btree_node_hashed(b));
-	BUG_ON(btree_node_dirty(b));
-	BUG_ON(btree_node_write_in_flight(b));
-out:
-	b->flags		= 0;
-	b->written		= 0;
-	b->nsets		= 0;
-	b->sib_u64s[0]		= 0;
-	b->sib_u64s[1]		= 0;
-	b->whiteout_u64s	= 0;
-	bch2_btree_keys_init(b);
-	set_btree_node_accessed(b);
-
-	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
-			       start_time);
-
-	memalloc_nofs_restore(flags);
-	return b;
-err:
-	mutex_lock(&bc->lock);
-
-	/* Try to cannibalize another cached btree node: */
-	if (bc->alloc_lock == current) {
-		b2 = btree_node_cannibalize(c);
-		clear_btree_node_just_written(b2);
-		bch2_btree_node_hash_remove(bc, b2);
-
-		if (b) {
-			swap(b->data, b2->data);
-			swap(b->aux_data, b2->aux_data);
-			btree_node_to_freedlist(bc, b2);
-			six_unlock_write(&b2->c.lock);
-			six_unlock_intent(&b2->c.lock);
-		} else {
-			b = b2;
-			list_del_init(&b->list);
-		}
-
-		mutex_unlock(&bc->lock);
-
-		trace_and_count(c, btree_cache_cannibalize, trans);
-		goto out;
-	}
-
-	mutex_unlock(&bc->lock);
-	memalloc_nofs_restore(flags);
-	return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
-}
-
-/* Slowpath, don't want it inlined into btree_iter_traverse() */
-static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
-				struct btree_path *path,
-				const struct bkey_i *k,
-				enum btree_id btree_id,
-				unsigned level,
-				enum six_lock_type lock_type,
-				bool sync)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_cache *bc = &c->btree_cache;
-	struct btree *b;
-
-	if (unlikely(level >= BTREE_MAX_DEPTH)) {
-		int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
-						 level, BTREE_MAX_DEPTH);
-		return ERR_PTR(ret);
-	}
-
-	if (unlikely(!bkey_is_btree_ptr(&k->k))) {
-		struct printbuf buf = PRINTBUF;
-		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
-
-		int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
-		printbuf_exit(&buf);
-		return ERR_PTR(ret);
-	}
-
-	if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
-		struct printbuf buf = PRINTBUF;
-		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
-
-		int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
-		printbuf_exit(&buf);
-		return ERR_PTR(ret);
-	}
-
-	/*
-	 * Parent node must be locked, else we could read in a btree node that's
-	 * been freed:
-	 */
-	if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
-		trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
-		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
-	}
-
-	b = bch2_btree_node_mem_alloc(trans, level != 0);
-
-	if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
-		if (!path)
-			return b;
-
-		trans->memory_allocation_failure = true;
-		trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
-		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
-	}
-
-	if (IS_ERR(b))
-		return b;
-
-	bkey_copy(&b->key, k);
-	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
-		/* raced with another fill: */
-
-		/* mark as unhashed... */
-		b->hash_val = 0;
-
-		mutex_lock(&bc->lock);
-		list_add(&b->list, &bc->freeable);
-		mutex_unlock(&bc->lock);
-
-		six_unlock_write(&b->c.lock);
-		six_unlock_intent(&b->c.lock);
-		return NULL;
-	}
-
-	set_btree_node_read_in_flight(b);
-	six_unlock_write(&b->c.lock);
-
-	if (path) {
-		u32 seq = six_lock_seq(&b->c.lock);
-
-		/* Unlock before doing IO: */
-		six_unlock_intent(&b->c.lock);
-		bch2_trans_unlock_noassert(trans);
-
-		bch2_btree_node_read(trans, b, sync);
-
-		if (!sync)
-			return NULL;
-
-		if (!six_relock_type(&b->c.lock, lock_type, seq))
-			b = NULL;
-	} else {
-		bch2_btree_node_read(trans, b, sync);
-		if (lock_type == SIX_LOCK_read)
-			six_lock_downgrade(&b->c.lock);
-	}
-
-	return b;
-}
-
-static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
-{
-	struct printbuf buf = PRINTBUF;
-
-	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
-		return;
-
-	prt_printf(&buf,
-	       "btree node header doesn't match ptr\n"
-	       "btree %s level %u\n"
-	       "ptr: ",
-	       bch2_btree_id_str(b->c.btree_id), b->c.level);
-	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
-	prt_printf(&buf, "\nheader: btree %s level %llu\n"
-	       "min ",
-	       bch2_btree_id_str(BTREE_NODE_ID(b->data)),
-	       BTREE_NODE_LEVEL(b->data));
-	bch2_bpos_to_text(&buf, b->data->min_key);
-
-	prt_printf(&buf, "\nmax ");
-	bch2_bpos_to_text(&buf, b->data->max_key);
-
-	bch2_fs_topology_error(c, "%s", buf.buf);
-
-	printbuf_exit(&buf);
-}
-
-static inline void btree_check_header(struct bch_fs *c, struct btree *b)
-{
-	if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
-	    b->c.level != BTREE_NODE_LEVEL(b->data) ||
-	    !bpos_eq(b->data->max_key, b->key.k.p) ||
-	    (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
-	     !bpos_eq(b->data->min_key,
-		      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
-		btree_bad_header(c, b);
-}
-
-static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
-					   const struct bkey_i *k, unsigned level,
-					   enum six_lock_type lock_type,
-					   unsigned long trace_ip)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_cache *bc = &c->btree_cache;
-	struct btree *b;
-	bool need_relock = false;
-	int ret;
-
-	EBUG_ON(level >= BTREE_MAX_DEPTH);
-retry:
-	b = btree_cache_find(bc, k);
-	if (unlikely(!b)) {
-		/*
-		 * We must have the parent locked to call bch2_btree_node_fill(),
-		 * else we could read in a btree node from disk that's been
-		 * freed:
-		 */
-		b = bch2_btree_node_fill(trans, path, k, path->btree_id,
-					 level, lock_type, true);
-		need_relock = true;
-
-		/* We raced and found the btree node in the cache */
-		if (!b)
-			goto retry;
-
-		if (IS_ERR(b))
-			return b;
-	} else {
-		if (btree_node_read_locked(path, level + 1))
-			btree_node_unlock(trans, path, level + 1);
-
-		ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
-		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			return ERR_PTR(ret);
-
-		BUG_ON(ret);
-
-		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
-			     b->c.level != level ||
-			     race_fault())) {
-			six_unlock_type(&b->c.lock, lock_type);
-			if (bch2_btree_node_relock(trans, path, level + 1))
-				goto retry;
-
-			trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
-			return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
-		}
-
-		/* avoid atomic set bit if it's not needed: */
-		if (!btree_node_accessed(b))
-			set_btree_node_accessed(b);
-	}
-
-	if (unlikely(btree_node_read_in_flight(b))) {
-		u32 seq = six_lock_seq(&b->c.lock);
-
-		six_unlock_type(&b->c.lock, lock_type);
-		bch2_trans_unlock(trans);
-		need_relock = true;
-
-		bch2_btree_node_wait_on_read(b);
-
-		/*
-		 * should_be_locked is not set on this path yet, so we need to
-		 * relock it specifically:
-		 */
-		if (!six_relock_type(&b->c.lock, lock_type, seq))
-			goto retry;
-	}
-
-	if (unlikely(need_relock)) {
-		ret = bch2_trans_relock(trans) ?:
-			bch2_btree_path_relock_intent(trans, path);
-		if (ret) {
-			six_unlock_type(&b->c.lock, lock_type);
-			return ERR_PTR(ret);
-		}
-	}
-
-	prefetch(b->aux_data);
-
-	for_each_bset(b, t) {
-		void *p = (u64 *) b->aux_data + t->aux_data_offset;
-
-		prefetch(p + L1_CACHE_BYTES * 0);
-		prefetch(p + L1_CACHE_BYTES * 1);
-		prefetch(p + L1_CACHE_BYTES * 2);
-	}
-
-	if (unlikely(btree_node_read_error(b))) {
-		six_unlock_type(&b->c.lock, lock_type);
-		return ERR_PTR(-BCH_ERR_btree_node_read_error);
-	}
-
-	EBUG_ON(b->c.btree_id != path->btree_id);
-	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
-	btree_check_header(c, b);
-
-	return b;
-}
-
-/**
- * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
- * in from disk if necessary.
- *
- * @trans:	btree transaction object
- * @path:	btree_path being traversed
- * @k:		pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
- * @level:	level of btree node being looked up (0 == leaf node)
- * @lock_type:	SIX_LOCK_read or SIX_LOCK_intent
- * @trace_ip:	ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
- *
- * The btree node will have either a read or a write lock held, depending on
- * the @write parameter.
- *
- * Returns: btree node or ERR_PTR()
- */
-struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
-				  const struct bkey_i *k, unsigned level,
-				  enum six_lock_type lock_type,
-				  unsigned long trace_ip)
-{
-	struct bch_fs *c = trans->c;
-	struct btree *b;
-	int ret;
-
-	EBUG_ON(level >= BTREE_MAX_DEPTH);
-
-	b = btree_node_mem_ptr(k);
-
-	/*
-	 * Check b->hash_val _before_ calling btree_node_lock() - this might not
-	 * be the node we want anymore, and trying to lock the wrong node could
-	 * cause an unneccessary transaction restart:
-	 */
-	if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
-		     !b ||
-		     b->hash_val != btree_ptr_hash_val(k)))
-		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
-
-	if (btree_node_read_locked(path, level + 1))
-		btree_node_unlock(trans, path, level + 1);
-
-	ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
-	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-		return ERR_PTR(ret);
-
-	BUG_ON(ret);
-
-	if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
-		     b->c.level != level ||
-		     race_fault())) {
-		six_unlock_type(&b->c.lock, lock_type);
-		if (bch2_btree_node_relock(trans, path, level + 1))
-			return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
-
-		trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
-		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
-	}
-
-	if (unlikely(btree_node_read_in_flight(b))) {
-		six_unlock_type(&b->c.lock, lock_type);
-		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
-	}
-
-	prefetch(b->aux_data);
-
-	for_each_bset(b, t) {
-		void *p = (u64 *) b->aux_data + t->aux_data_offset;
-
-		prefetch(p + L1_CACHE_BYTES * 0);
-		prefetch(p + L1_CACHE_BYTES * 1);
-		prefetch(p + L1_CACHE_BYTES * 2);
-	}
-
-	/* avoid atomic set bit if it's not needed: */
-	if (!btree_node_accessed(b))
-		set_btree_node_accessed(b);
-
-	if (unlikely(btree_node_read_error(b))) {
-		six_unlock_type(&b->c.lock, lock_type);
-		return ERR_PTR(-BCH_ERR_btree_node_read_error);
-	}
-
-	EBUG_ON(b->c.btree_id != path->btree_id);
-	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
-	btree_check_header(c, b);
-
-	return b;
-}
-
-struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
-					 const struct bkey_i *k,
-					 enum btree_id btree_id,
-					 unsigned level,
-					 bool nofill)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_cache *bc = &c->btree_cache;
-	struct btree *b;
-	int ret;
-
-	EBUG_ON(level >= BTREE_MAX_DEPTH);
-
-	if (c->opts.btree_node_mem_ptr_optimization) {
-		b = btree_node_mem_ptr(k);
-		if (b)
-			goto lock_node;
-	}
-retry:
-	b = btree_cache_find(bc, k);
-	if (unlikely(!b)) {
-		if (nofill)
-			goto out;
-
-		b = bch2_btree_node_fill(trans, NULL, k, btree_id,
-					 level, SIX_LOCK_read, true);
-
-		/* We raced and found the btree node in the cache */
-		if (!b)
-			goto retry;
-
-		if (IS_ERR(b) &&
-		    !bch2_btree_cache_cannibalize_lock(trans, NULL))
-			goto retry;
-
-		if (IS_ERR(b))
-			goto out;
-	} else {
-lock_node:
-		ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
-		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			return ERR_PTR(ret);
-
-		BUG_ON(ret);
-
-		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
-			     b->c.btree_id != btree_id ||
-			     b->c.level != level)) {
-			six_unlock_read(&b->c.lock);
-			goto retry;
-		}
-	}
-
-	/* XXX: waiting on IO with btree locks held: */
-	__bch2_btree_node_wait_on_read(b);
-
-	prefetch(b->aux_data);
-
-	for_each_bset(b, t) {
-		void *p = (u64 *) b->aux_data + t->aux_data_offset;
-
-		prefetch(p + L1_CACHE_BYTES * 0);
-		prefetch(p + L1_CACHE_BYTES * 1);
-		prefetch(p + L1_CACHE_BYTES * 2);
-	}
-
-	/* avoid atomic set bit if it's not needed: */
-	if (!btree_node_accessed(b))
-		set_btree_node_accessed(b);
-
-	if (unlikely(btree_node_read_error(b))) {
-		six_unlock_read(&b->c.lock);
-		b = ERR_PTR(-BCH_ERR_btree_node_read_error);
-		goto out;
-	}
-
-	EBUG_ON(b->c.btree_id != btree_id);
-	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
-	btree_check_header(c, b);
-out:
-	bch2_btree_cache_cannibalize_unlock(trans);
-	return b;
-}
-
-int bch2_btree_node_prefetch(struct btree_trans *trans,
-			     struct btree_path *path,
-			     const struct bkey_i *k,
-			     enum btree_id btree_id, unsigned level)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_cache *bc = &c->btree_cache;
-
-	BUG_ON(path && !btree_node_locked(path, level + 1));
-	BUG_ON(level >= BTREE_MAX_DEPTH);
-
-	struct btree *b = btree_cache_find(bc, k);
-	if (b)
-		return 0;
-
-	b = bch2_btree_node_fill(trans, path, k, btree_id,
-				 level, SIX_LOCK_read, false);
-	if (!IS_ERR_OR_NULL(b))
-		six_unlock_read(&b->c.lock);
-	return bch2_trans_relock(trans) ?: PTR_ERR_OR_ZERO(b);
-}
-
-void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_cache *bc = &c->btree_cache;
-	struct btree *b;
-
-	b = btree_cache_find(bc, k);
-	if (!b)
-		return;
-
-	BUG_ON(b == btree_node_root(trans->c, b));
-wait_on_io:
-	/* not allowed to wait on io with btree locks held: */
-
-	/* XXX we're called from btree_gc which will be holding other btree
-	 * nodes locked
-	 */
-	__bch2_btree_node_wait_on_read(b);
-	__bch2_btree_node_wait_on_write(b);
-
-	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
-	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
-	if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
-		goto out;
-
-	if (btree_node_dirty(b)) {
-		__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
-		six_unlock_write(&b->c.lock);
-		six_unlock_intent(&b->c.lock);
-		goto wait_on_io;
-	}
-
-	BUG_ON(btree_node_dirty(b));
-
-	mutex_lock(&bc->lock);
-	btree_node_data_free(c, b);
-	bch2_btree_node_hash_remove(bc, b);
-	mutex_unlock(&bc->lock);
-out:
-	six_unlock_write(&b->c.lock);
-	six_unlock_intent(&b->c.lock);
-}
-
-const char *bch2_btree_id_str(enum btree_id btree)
-{
-	return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
-}
-
-void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
-{
-	prt_printf(out, "%s level %u/%u\n  ",
-	       bch2_btree_id_str(b->c.btree_id),
-	       b->c.level,
-	       bch2_btree_id_root(c, b->c.btree_id)->level);
-	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
-}
-
-void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
-{
-	struct bset_stats stats;
-
-	memset(&stats, 0, sizeof(stats));
-
-	bch2_btree_keys_stats(b, &stats);
-
-	prt_printf(out, "l %u ", b->c.level);
-	bch2_bpos_to_text(out, b->data->min_key);
-	prt_printf(out, " - ");
-	bch2_bpos_to_text(out, b->data->max_key);
-	prt_printf(out, ":\n"
-	       "    ptrs: ");
-	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
-	prt_newline(out);
-
-	prt_printf(out,
-	       "    format: ");
-	bch2_bkey_format_to_text(out, &b->format);
-
-	prt_printf(out,
-	       "    unpack fn len: %u\n"
-	       "    bytes used %zu/%zu (%zu%% full)\n"
-	       "    sib u64s: %u, %u (merge threshold %u)\n"
-	       "    nr packed keys %u\n"
-	       "    nr unpacked keys %u\n"
-	       "    floats %zu\n"
-	       "    failed unpacked %zu\n",
-	       b->unpack_fn_len,
-	       b->nr.live_u64s * sizeof(u64),
-	       btree_buf_bytes(b) - sizeof(struct btree_node),
-	       b->nr.live_u64s * 100 / btree_max_u64s(c),
-	       b->sib_u64s[0],
-	       b->sib_u64s[1],
-	       c->btree_foreground_merge_threshold,
-	       b->nr.packed_keys,
-	       b->nr.unpacked_keys,
-	       stats.floats,
-	       stats.failed);
-}
-
-static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
-				 const char *label, unsigned nr)
-{
-	prt_printf(out, "%s\t", label);
-	prt_human_readable_u64(out, nr * c->opts.btree_node_size);
-	prt_printf(out, " (%u)\n", nr);
-}
-
-void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
-{
-	struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
-
-	if (!out->nr_tabstops)
-		printbuf_tabstop_push(out, 32);
-
-	prt_btree_cache_line(out, c, "total:",		bc->used);
-	prt_btree_cache_line(out, c, "nr dirty:",	atomic_read(&bc->dirty));
-	prt_printf(out, "cannibalize lock:\t%p\n",	bc->alloc_lock);
-	prt_newline(out);
-
-	for (unsigned i = 0; i < ARRAY_SIZE(bc->used_by_btree); i++)
-		prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->used_by_btree[i]);
-
-	prt_newline(out);
-	prt_printf(out, "freed:\t%u\n", bc->freed);
-	prt_printf(out, "not freed:\n");
-	prt_printf(out, "  dirty\t%u\n", bc->not_freed_dirty);
-	prt_printf(out, "  write in flight\t%u\n", bc->not_freed_write_in_flight);
-	prt_printf(out, "  read in flight\t%u\n", bc->not_freed_read_in_flight);
-	prt_printf(out, "  lock intent failed\t%u\n", bc->not_freed_lock_intent);
-	prt_printf(out, "  lock write failed\t%u\n", bc->not_freed_lock_write);
-	prt_printf(out, "  access bit\t%u\n", bc->not_freed_access_bit);
-	prt_printf(out, "  no evict failed\t%u\n", bc->not_freed_noevict);
-	prt_printf(out, "  write blocked\t%u\n", bc->not_freed_write_blocked);
-	prt_printf(out, "  will make reachable\t%u\n", bc->not_freed_will_make_reachable);
-}
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
deleted file mode 100644
index fed35de3e4de..000000000000
--- a/fs/bcachefs/btree_cache.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_CACHE_H
-#define _BCACHEFS_BTREE_CACHE_H
-
-#include "bcachefs.h"
-#include "btree_types.h"
-#include "bkey_methods.h"
-
-extern const char * const bch2_btree_node_flags[];
-
-struct btree_iter;
-
-void bch2_recalc_btree_reserve(struct bch_fs *);
-
-void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
-int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
-int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
-				unsigned, enum btree_id);
-
-void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned,
-				      struct bkey_s_c, struct bkey_i *);
-
-void bch2_btree_cache_cannibalize_unlock(struct btree_trans *);
-int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *);
-
-struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
-struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool);
-
-struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *,
-				  const struct bkey_i *, unsigned,
-				  enum six_lock_type, unsigned long);
-
-struct btree *bch2_btree_node_get_noiter(struct btree_trans *, const struct bkey_i *,
-					 enum btree_id, unsigned, bool);
-
-int bch2_btree_node_prefetch(struct btree_trans *, struct btree_path *,
-			     const struct bkey_i *, enum btree_id, unsigned);
-
-void bch2_btree_node_evict(struct btree_trans *, const struct bkey_i *);
-
-void bch2_fs_btree_cache_exit(struct bch_fs *);
-int bch2_fs_btree_cache_init(struct bch_fs *);
-void bch2_fs_btree_cache_init_early(struct btree_cache *);
-
-static inline u64 btree_ptr_hash_val(const struct bkey_i *k)
-{
-	switch (k->k.type) {
-	case KEY_TYPE_btree_ptr:
-		return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start);
-	case KEY_TYPE_btree_ptr_v2:
-		/*
-		 * The cast/deref is only necessary to avoid sparse endianness
-		 * warnings:
-		 */
-		return *((u64 *) &bkey_i_to_btree_ptr_v2_c(k)->v.seq);
-	default:
-		return 0;
-	}
-}
-
-static inline struct btree *btree_node_mem_ptr(const struct bkey_i *k)
-{
-	return k->k.type == KEY_TYPE_btree_ptr_v2
-		? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr
-		: NULL;
-}
-
-/* is btree node in hash table? */
-static inline bool btree_node_hashed(struct btree *b)
-{
-	return b->hash_val != 0;
-}
-
-#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos)		\
-	for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl,	\
-					  &(_c)->btree_cache.table),	\
-	     _iter = 0;	_iter < (_tbl)->size; _iter++)			\
-		rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)
-
-static inline size_t btree_buf_bytes(const struct btree *b)
-{
-	return 1UL << b->byte_order;
-}
-
-static inline size_t btree_buf_max_u64s(const struct btree *b)
-{
-	return (btree_buf_bytes(b) - sizeof(struct btree_node)) / sizeof(u64);
-}
-
-static inline size_t btree_max_u64s(const struct bch_fs *c)
-{
-	return (c->opts.btree_node_size - sizeof(struct btree_node)) / sizeof(u64);
-}
-
-static inline size_t btree_sectors(const struct bch_fs *c)
-{
-	return c->opts.btree_node_size >> SECTOR_SHIFT;
-}
-
-static inline unsigned btree_blocks(const struct bch_fs *c)
-{
-	return btree_sectors(c) >> c->block_bits;
-}
-
-#define BTREE_SPLIT_THRESHOLD(c)		(btree_max_u64s(c) * 2 / 3)
-
-#define BTREE_FOREGROUND_MERGE_THRESHOLD(c)	(btree_max_u64s(c) * 1 / 3)
-#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c)			\
-	(BTREE_FOREGROUND_MERGE_THRESHOLD(c) +			\
-	 (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2))
-
-static inline unsigned btree_id_nr_alive(struct bch_fs *c)
-{
-	return BTREE_ID_NR + c->btree_roots_extra.nr;
-}
-
-static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned id)
-{
-	if (likely(id < BTREE_ID_NR)) {
-		return &c->btree_roots_known[id];
-	} else {
-		unsigned idx = id - BTREE_ID_NR;
-
-		EBUG_ON(idx >= c->btree_roots_extra.nr);
-		return &c->btree_roots_extra.data[idx];
-	}
-}
-
-static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b)
-{
-	return bch2_btree_id_root(c, b->c.btree_id)->b;
-}
-
-const char *bch2_btree_id_str(enum btree_id);
-void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
-void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
-void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *);
-
-#endif /* _BCACHEFS_BTREE_CACHE_H */
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
deleted file mode 100644
index 8035c8b797ab..000000000000
--- a/fs/bcachefs/btree_gc.c
+++ /dev/null
@@ -1,1429 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright (C) 2014 Datera Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "bkey_methods.h"
-#include "bkey_buf.h"
-#include "btree_journal_iter.h"
-#include "btree_key_cache.h"
-#include "btree_locking.h"
-#include "btree_node_scan.h"
-#include "btree_update_interior.h"
-#include "btree_io.h"
-#include "btree_gc.h"
-#include "buckets.h"
-#include "clock.h"
-#include "debug.h"
-#include "ec.h"
-#include "error.h"
-#include "extents.h"
-#include "journal.h"
-#include "keylist.h"
-#include "move.h"
-#include "recovery_passes.h"
-#include "reflink.h"
-#include "replicas.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/slab.h>
-#include <linux/bitops.h>
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/preempt.h>
-#include <linux/rcupdate.h>
-#include <linux/sched/task.h>
-
-#define DROP_THIS_NODE		10
-#define DROP_PREV_NODE		11
-#define DID_FILL_FROM_SCAN	12
-
-static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
-{
-	return (struct bkey_s) {{{
-		(struct bkey *) k.k,
-		(struct bch_val *) k.v
-	}}};
-}
-
-static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
-{
-	preempt_disable();
-	write_seqcount_begin(&c->gc_pos_lock);
-	c->gc_pos = new_pos;
-	write_seqcount_end(&c->gc_pos_lock);
-	preempt_enable();
-}
-
-static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
-{
-	BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) < 0);
-	__gc_pos_set(c, new_pos);
-}
-
-static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
-{
-	switch (b->key.k.type) {
-	case KEY_TYPE_btree_ptr: {
-		struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key);
-
-		dst->k.p		= src->k.p;
-		dst->v.mem_ptr		= 0;
-		dst->v.seq		= b->data->keys.seq;
-		dst->v.sectors_written	= 0;
-		dst->v.flags		= 0;
-		dst->v.min_key		= b->data->min_key;
-		set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k));
-		memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k));
-		break;
-	}
-	case KEY_TYPE_btree_ptr_v2:
-		bkey_copy(&dst->k_i, &b->key);
-		break;
-	default:
-		BUG();
-	}
-}
-
-static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
-{
-	struct bkey_i_btree_ptr_v2 *new;
-	int ret;
-
-	if (c->opts.verbose) {
-		struct printbuf buf = PRINTBUF;
-
-		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-		prt_str(&buf, " -> ");
-		bch2_bpos_to_text(&buf, new_min);
-
-		bch_info(c, "%s(): %s", __func__, buf.buf);
-		printbuf_exit(&buf);
-	}
-
-	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
-	if (!new)
-		return -BCH_ERR_ENOMEM_gc_repair_key;
-
-	btree_ptr_to_v2(b, new);
-	b->data->min_key	= new_min;
-	new->v.min_key		= new_min;
-	SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
-
-	ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
-	if (ret) {
-		kfree(new);
-		return ret;
-	}
-
-	bch2_btree_node_drop_keys_outside_node(b);
-	bkey_copy(&b->key, &new->k_i);
-	return 0;
-}
-
-static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
-{
-	struct bkey_i_btree_ptr_v2 *new;
-	int ret;
-
-	if (c->opts.verbose) {
-		struct printbuf buf = PRINTBUF;
-
-		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-		prt_str(&buf, " -> ");
-		bch2_bpos_to_text(&buf, new_max);
-
-		bch_info(c, "%s(): %s", __func__, buf.buf);
-		printbuf_exit(&buf);
-	}
-
-	ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
-	if (ret)
-		return ret;
-
-	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
-	if (!new)
-		return -BCH_ERR_ENOMEM_gc_repair_key;
-
-	btree_ptr_to_v2(b, new);
-	b->data->max_key	= new_max;
-	new->k.p		= new_max;
-	SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
-
-	ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
-	if (ret) {
-		kfree(new);
-		return ret;
-	}
-
-	bch2_btree_node_drop_keys_outside_node(b);
-
-	mutex_lock(&c->btree_cache.lock);
-	bch2_btree_node_hash_remove(&c->btree_cache, b);
-
-	bkey_copy(&b->key, &new->k_i);
-	ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
-	BUG_ON(ret);
-	mutex_unlock(&c->btree_cache.lock);
-	return 0;
-}
-
-static int btree_check_node_boundaries(struct bch_fs *c, struct btree *b,
-				       struct btree *prev, struct btree *cur,
-				       struct bpos *pulled_from_scan)
-{
-	struct bpos expected_start = !prev
-		? b->data->min_key
-		: bpos_successor(prev->key.k.p);
-	struct printbuf buf = PRINTBUF;
-	int ret = 0;
-
-	BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
-	       !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
-			b->data->min_key));
-
-	if (bpos_eq(expected_start, cur->data->min_key))
-		return 0;
-
-	prt_printf(&buf, "  at btree %s level %u:\n  parent: ",
-		   bch2_btree_id_str(b->c.btree_id), b->c.level);
-	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
-	if (prev) {
-		prt_printf(&buf, "\n  prev: ");
-		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key));
-	}
-
-	prt_str(&buf, "\n  next: ");
-	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key));
-
-	if (bpos_lt(expected_start, cur->data->min_key)) {				/* gap */
-		if (b->c.level == 1 &&
-		    bpos_lt(*pulled_from_scan, cur->data->min_key)) {
-			ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
-						     expected_start,
-						     bpos_predecessor(cur->data->min_key));
-			if (ret)
-				goto err;
-
-			*pulled_from_scan = cur->data->min_key;
-			ret = DID_FILL_FROM_SCAN;
-		} else {
-			if (mustfix_fsck_err(c, btree_node_topology_bad_min_key,
-					     "btree node with incorrect min_key%s", buf.buf))
-				ret = set_node_min(c, cur, expected_start);
-		}
-	} else {									/* overlap */
-		if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) {	/* cur overwrites prev */
-			if (bpos_ge(prev->data->min_key, cur->data->min_key)) {		/* fully? */
-				if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_next_node,
-						     "btree node overwritten by next node%s", buf.buf))
-					ret = DROP_PREV_NODE;
-			} else {
-				if (mustfix_fsck_err(c, btree_node_topology_bad_max_key,
-						     "btree node with incorrect max_key%s", buf.buf))
-					ret = set_node_max(c, prev,
-							   bpos_predecessor(cur->data->min_key));
-			}
-		} else {
-			if (bpos_ge(expected_start, cur->data->max_key)) {		/* fully? */
-				if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_prev_node,
-						     "btree node overwritten by prev node%s", buf.buf))
-					ret = DROP_THIS_NODE;
-			} else {
-				if (mustfix_fsck_err(c, btree_node_topology_bad_min_key,
-						     "btree node with incorrect min_key%s", buf.buf))
-					ret = set_node_min(c, cur, expected_start);
-			}
-		}
-	}
-err:
-fsck_err:
-	printbuf_exit(&buf);
-	return ret;
-}
-
-static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
-				 struct btree *child, struct bpos *pulled_from_scan)
-{
-	struct printbuf buf = PRINTBUF;
-	int ret = 0;
-
-	if (bpos_eq(child->key.k.p, b->key.k.p))
-		return 0;
-
-	prt_printf(&buf, "at btree %s level %u:\n  parent: ",
-		   bch2_btree_id_str(b->c.btree_id), b->c.level);
-	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
-	prt_str(&buf, "\n  child: ");
-	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));
-
-	if (mustfix_fsck_err(c, btree_node_topology_bad_max_key,
-			     "btree node with incorrect max_key%s", buf.buf)) {
-		if (b->c.level == 1 &&
-		    bpos_lt(*pulled_from_scan, b->key.k.p)) {
-			ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
-						bpos_successor(child->key.k.p), b->key.k.p);
-			if (ret)
-				goto err;
-
-			*pulled_from_scan = b->key.k.p;
-			ret = DID_FILL_FROM_SCAN;
-		} else {
-			ret = set_node_max(c, child, b->key.k.p);
-		}
-	}
-err:
-fsck_err:
-	printbuf_exit(&buf);
-	return ret;
-}
-
-static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b,
-					      struct bpos *pulled_from_scan)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_and_journal_iter iter;
-	struct bkey_s_c k;
-	struct bkey_buf prev_k, cur_k;
-	struct btree *prev = NULL, *cur = NULL;
-	bool have_child, new_pass = false;
-	struct printbuf buf = PRINTBUF;
-	int ret = 0;
-
-	if (!b->c.level)
-		return 0;
-
-	bch2_bkey_buf_init(&prev_k);
-	bch2_bkey_buf_init(&cur_k);
-again:
-	cur = prev = NULL;
-	have_child = new_pass = false;
-	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
-	iter.prefetch = true;
-
-	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
-		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
-		BUG_ON(bpos_gt(k.k->p, b->data->max_key));
-
-		bch2_btree_and_journal_iter_advance(&iter);
-		bch2_bkey_buf_reassemble(&cur_k, c, k);
-
-		cur = bch2_btree_node_get_noiter(trans, cur_k.k,
-					b->c.btree_id, b->c.level - 1,
-					false);
-		ret = PTR_ERR_OR_ZERO(cur);
-
-		printbuf_reset(&buf);
-		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
-
-		if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), c,
-				btree_node_unreadable,
-				"Topology repair: unreadable btree node at btree %s level %u:\n"
-				"  %s",
-				bch2_btree_id_str(b->c.btree_id),
-				b->c.level - 1,
-				buf.buf)) {
-			bch2_btree_node_evict(trans, cur_k.k);
-			cur = NULL;
-			ret = bch2_journal_key_delete(c, b->c.btree_id,
-						      b->c.level, cur_k.k->k.p);
-			if (ret)
-				break;
-
-			if (!btree_id_is_alloc(b->c.btree_id)) {
-				ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
-				if (ret)
-					break;
-			}
-			continue;
-		}
-
-		bch_err_msg(c, ret, "getting btree node");
-		if (ret)
-			break;
-
-		if (bch2_btree_node_is_stale(c, cur)) {
-			bch_info(c, "btree node %s older than nodes found by scanning", buf.buf);
-			six_unlock_read(&cur->c.lock);
-			bch2_btree_node_evict(trans, cur_k.k);
-			ret = bch2_journal_key_delete(c, b->c.btree_id,
-						      b->c.level, cur_k.k->k.p);
-			cur = NULL;
-			if (ret)
-				break;
-			continue;
-		}
-
-		ret = btree_check_node_boundaries(c, b, prev, cur, pulled_from_scan);
-		if (ret == DID_FILL_FROM_SCAN) {
-			new_pass = true;
-			ret = 0;
-		}
-
-		if (ret == DROP_THIS_NODE) {
-			six_unlock_read(&cur->c.lock);
-			bch2_btree_node_evict(trans, cur_k.k);
-			ret = bch2_journal_key_delete(c, b->c.btree_id,
-						      b->c.level, cur_k.k->k.p);
-			cur = NULL;
-			if (ret)
-				break;
-			continue;
-		}
-
-		if (prev)
-			six_unlock_read(&prev->c.lock);
-		prev = NULL;
-
-		if (ret == DROP_PREV_NODE) {
-			bch_info(c, "dropped prev node");
-			bch2_btree_node_evict(trans, prev_k.k);
-			ret = bch2_journal_key_delete(c, b->c.btree_id,
-						      b->c.level, prev_k.k->k.p);
-			if (ret)
-				break;
-
-			bch2_btree_and_journal_iter_exit(&iter);
-			goto again;
-		} else if (ret)
-			break;
-
-		prev = cur;
-		cur = NULL;
-		bch2_bkey_buf_copy(&prev_k, c, cur_k.k);
-	}
-
-	if (!ret && !IS_ERR_OR_NULL(prev)) {
-		BUG_ON(cur);
-		ret = btree_repair_node_end(c, b, prev, pulled_from_scan);
-		if (ret == DID_FILL_FROM_SCAN) {
-			new_pass = true;
-			ret = 0;
-		}
-	}
-
-	if (!IS_ERR_OR_NULL(prev))
-		six_unlock_read(&prev->c.lock);
-	prev = NULL;
-	if (!IS_ERR_OR_NULL(cur))
-		six_unlock_read(&cur->c.lock);
-	cur = NULL;
-
-	if (ret)
-		goto err;
-
-	bch2_btree_and_journal_iter_exit(&iter);
-
-	if (new_pass)
-		goto again;
-
-	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
-	iter.prefetch = true;
-
-	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
-		bch2_bkey_buf_reassemble(&cur_k, c, k);
-		bch2_btree_and_journal_iter_advance(&iter);
-
-		cur = bch2_btree_node_get_noiter(trans, cur_k.k,
-					b->c.btree_id, b->c.level - 1,
-					false);
-		ret = PTR_ERR_OR_ZERO(cur);
-
-		bch_err_msg(c, ret, "getting btree node");
-		if (ret)
-			goto err;
-
-		ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan);
-		six_unlock_read(&cur->c.lock);
-		cur = NULL;
-
-		if (ret == DROP_THIS_NODE) {
-			bch2_btree_node_evict(trans, cur_k.k);
-			ret = bch2_journal_key_delete(c, b->c.btree_id,
-						      b->c.level, cur_k.k->k.p);
-			new_pass = true;
-		}
-
-		if (ret)
-			goto err;
-
-		have_child = true;
-	}
-
-	printbuf_reset(&buf);
-	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
-	if (mustfix_fsck_err_on(!have_child, c,
-			btree_node_topology_interior_node_empty,
-			"empty interior btree node at btree %s level %u\n"
-			"  %s",
-			bch2_btree_id_str(b->c.btree_id),
-			b->c.level, buf.buf))
-		ret = DROP_THIS_NODE;
-err:
-fsck_err:
-	if (!IS_ERR_OR_NULL(prev))
-		six_unlock_read(&prev->c.lock);
-	if (!IS_ERR_OR_NULL(cur))
-		six_unlock_read(&cur->c.lock);
-
-	bch2_btree_and_journal_iter_exit(&iter);
-
-	if (!ret && new_pass)
-		goto again;
-
-	BUG_ON(!ret && bch2_btree_node_check_topology(trans, b));
-
-	bch2_bkey_buf_exit(&prev_k, c);
-	bch2_bkey_buf_exit(&cur_k, c);
-	printbuf_exit(&buf);
-	return ret;
-}
-
-int bch2_check_topology(struct bch_fs *c)
-{
-	struct btree_trans *trans = bch2_trans_get(c);
-	struct bpos pulled_from_scan = POS_MIN;
-	int ret = 0;
-
-	for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
-		struct btree_root *r = bch2_btree_id_root(c, i);
-		bool reconstructed_root = false;
-
-		if (r->error) {
-			ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
-			if (ret)
-				break;
-reconstruct_root:
-			bch_info(c, "btree root %s unreadable, must recover from scan", bch2_btree_id_str(i));
-
-			r->alive = false;
-			r->error = 0;
-
-			if (!bch2_btree_has_scanned_nodes(c, i)) {
-				mustfix_fsck_err(c, btree_root_unreadable_and_scan_found_nothing,
-						 "no nodes found for btree %s, continue?", bch2_btree_id_str(i));
-				bch2_btree_root_alloc_fake_trans(trans, i, 0);
-			} else {
-				bch2_btree_root_alloc_fake_trans(trans, i, 1);
-				bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
-				ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX);
-				if (ret)
-					break;
-			}
-
-			reconstructed_root = true;
-		}
-
-		struct btree *b = r->b;
-
-		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
-		ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan);
-		six_unlock_read(&b->c.lock);
-
-		if (ret == DROP_THIS_NODE) {
-			bch2_btree_node_hash_remove(&c->btree_cache, b);
-			mutex_lock(&c->btree_cache.lock);
-			list_move(&b->list, &c->btree_cache.freeable);
-			mutex_unlock(&c->btree_cache.lock);
-
-			r->b = NULL;
-
-			if (!reconstructed_root)
-				goto reconstruct_root;
-
-			bch_err(c, "empty btree root %s", bch2_btree_id_str(i));
-			bch2_btree_root_alloc_fake_trans(trans, i, 0);
-			r->alive = false;
-			ret = 0;
-		}
-	}
-fsck_err:
-	bch2_trans_put(trans);
-	return ret;
-}
-
-/* marking of btree keys/nodes: */
-
-static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
-			    unsigned level, struct btree **prev,
-			    struct btree_iter *iter, struct bkey_s_c k,
-			    bool initial)
-{
-	struct bch_fs *c = trans->c;
-
-	if (iter) {
-		struct btree_path *path = btree_iter_path(trans, iter);
-		struct btree *b = path_l(path)->b;
-
-		if (*prev != b) {
-			int ret = bch2_btree_node_check_topology(trans, b);
-			if (ret)
-				return ret;
-		}
-		*prev = b;
-	}
-
-	struct bkey deleted = KEY(0, 0, 0);
-	struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
-	struct printbuf buf = PRINTBUF;
-	int ret = 0;
-
-	deleted.p = k.k->p;
-
-	if (initial) {
-		BUG_ON(bch2_journal_seq_verify &&
-		       k.k->version.lo > atomic64_read(&c->journal.seq));
-
-		if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c,
-				bkey_version_in_future,
-				"key version number higher than recorded: %llu > %llu",
-				k.k->version.lo,
-				atomic64_read(&c->key_version)))
-			atomic64_set(&c->key_version, k.k->version.lo);
-	}
-
-	if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
-				c, btree_bitmap_not_marked,
-				"btree ptr not marked in member info btree allocated bitmap\n  %s",
-				(bch2_bkey_val_to_text(&buf, c, k),
-				 buf.buf))) {
-		mutex_lock(&c->sb_lock);
-		bch2_dev_btree_bitmap_mark(c, k);
-		bch2_write_super(c);
-		mutex_unlock(&c->sb_lock);
-	}
-
-	/*
-	 * We require a commit before key_trigger() because
-	 * key_trigger(BTREE_TRIGGER_GC) is not idempotant; we'll calculate the
-	 * wrong result if we run it multiple times.
-	 */
-	unsigned flags = !iter ? BTREE_TRIGGER_is_root : 0;
-
-	ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
-			       BTREE_TRIGGER_check_repair|flags);
-	if (ret)
-		goto out;
-
-	if (trans->nr_updates) {
-		ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
-			-BCH_ERR_transaction_restart_nested;
-		goto out;
-	}
-
-	ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
-			       BTREE_TRIGGER_gc|flags);
-out:
-fsck_err:
-	printbuf_exit(&buf);
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial)
-{
-	struct bch_fs *c = trans->c;
-	int level = 0, target_depth = btree_node_type_needs_gc(__btree_node_type(0, btree)) ? 0 : 1;
-	int ret = 0;
-
-	/* We need to make sure every leaf node is readable before going RW */
-	if (initial)
-		target_depth = 0;
-
-	/* root */
-	mutex_lock(&c->btree_root_lock);
-	struct btree *b = bch2_btree_id_root(c, btree)->b;
-	if (!btree_node_fake(b)) {
-		gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX));
-		ret = lockrestart_do(trans,
-			bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1,
-					 NULL, NULL, bkey_i_to_s_c(&b->key), initial));
-		level = b->c.level;
-	}
-	mutex_unlock(&c->btree_root_lock);
-
-	if (ret)
-		return ret;
-
-	for (; level >= target_depth; --level) {
-		struct btree *prev = NULL;
-		struct btree_iter iter;
-		bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level,
-					  BTREE_ITER_prefetch);
-
-		ret = for_each_btree_key_continue(trans, iter, 0, k, ({
-			gc_pos_set(c, gc_pos_btree(btree, level, k.k->p));
-			bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
-		}));
-		if (ret)
-			break;
-	}
-
-	return ret;
-}
-
-static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
-{
-	return  (int) btree_id_to_gc_phase(l) -
-		(int) btree_id_to_gc_phase(r);
-}
-
-static int bch2_gc_btrees(struct bch_fs *c)
-{
-	struct btree_trans *trans = bch2_trans_get(c);
-	enum btree_id ids[BTREE_ID_NR];
-	unsigned i;
-	int ret = 0;
-
-	for (i = 0; i < BTREE_ID_NR; i++)
-		ids[i] = i;
-	bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
-
-	for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
-		unsigned btree = i < BTREE_ID_NR ? ids[i] : i;
-
-		if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b))
-			continue;
-
-		ret = bch2_gc_btree(trans, btree, true);
-
-		if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
-					c, btree_node_read_error,
-			       "btree node read error for %s",
-			       bch2_btree_id_str(btree)))
-			ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
-	}
-fsck_err:
-	bch2_trans_put(trans);
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-static int bch2_mark_superblocks(struct bch_fs *c)
-{
-	mutex_lock(&c->sb_lock);
-	gc_pos_set(c, gc_phase(GC_PHASE_SB));
-
-	int ret = bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc);
-	mutex_unlock(&c->sb_lock);
-	return ret;
-}
-
-static void bch2_gc_free(struct bch_fs *c)
-{
-	genradix_free(&c->reflink_gc_table);
-	genradix_free(&c->gc_stripes);
-
-	for_each_member_device(c, ca) {
-		kvfree(rcu_dereference_protected(ca->buckets_gc, 1));
-		ca->buckets_gc = NULL;
-
-		free_percpu(ca->usage_gc);
-		ca->usage_gc = NULL;
-	}
-
-	free_percpu(c->usage_gc);
-	c->usage_gc = NULL;
-}
-
-static int bch2_gc_done(struct bch_fs *c)
-{
-	struct bch_dev *ca = NULL;
-	struct printbuf buf = PRINTBUF;
-	unsigned i;
-	int ret = 0;
-
-	percpu_down_write(&c->mark_lock);
-
-#define copy_field(_err, _f, _msg, ...)						\
-	if (fsck_err_on(dst->_f != src->_f, c, _err,				\
-			_msg ": got %llu, should be %llu" , ##__VA_ARGS__,	\
-			dst->_f, src->_f))					\
-		dst->_f = src->_f
-#define copy_dev_field(_err, _f, _msg, ...)					\
-	copy_field(_err, _f, "dev %u has wrong " _msg, ca->dev_idx, ##__VA_ARGS__)
-#define copy_fs_field(_err, _f, _msg, ...)					\
-	copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__)
-
-	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
-		bch2_fs_usage_acc_to_base(c, i);
-
-	__for_each_member_device(c, ca) {
-		struct bch_dev_usage *dst = ca->usage_base;
-		struct bch_dev_usage *src = (void *)
-			bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc,
-					     dev_usage_u64s());
-
-		for (i = 0; i < BCH_DATA_NR; i++) {
-			copy_dev_field(dev_usage_buckets_wrong,
-				       d[i].buckets,	"%s buckets", bch2_data_type_str(i));
-			copy_dev_field(dev_usage_sectors_wrong,
-				       d[i].sectors,	"%s sectors", bch2_data_type_str(i));
-			copy_dev_field(dev_usage_fragmented_wrong,
-				       d[i].fragmented,	"%s fragmented", bch2_data_type_str(i));
-		}
-	}
-
-	{
-		unsigned nr = fs_usage_u64s(c);
-		struct bch_fs_usage *dst = c->usage_base;
-		struct bch_fs_usage *src = (void *)
-			bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr);
-
-		copy_fs_field(fs_usage_hidden_wrong,
-			      b.hidden,		"hidden");
-		copy_fs_field(fs_usage_btree_wrong,
-			      b.btree,		"btree");
-
-		copy_fs_field(fs_usage_data_wrong,
-			      b.data,	"data");
-		copy_fs_field(fs_usage_cached_wrong,
-			      b.cached,	"cached");
-		copy_fs_field(fs_usage_reserved_wrong,
-			      b.reserved,	"reserved");
-		copy_fs_field(fs_usage_nr_inodes_wrong,
-			      b.nr_inodes,"nr_inodes");
-
-		for (i = 0; i < BCH_REPLICAS_MAX; i++)
-			copy_fs_field(fs_usage_persistent_reserved_wrong,
-				      persistent_reserved[i],
-				      "persistent_reserved[%i]", i);
-
-		for (i = 0; i < c->replicas.nr; i++) {
-			struct bch_replicas_entry_v1 *e =
-				cpu_replicas_entry(&c->replicas, i);
-
-			printbuf_reset(&buf);
-			bch2_replicas_entry_to_text(&buf, e);
-
-			copy_fs_field(fs_usage_replicas_wrong,
-				      replicas[i], "%s", buf.buf);
-		}
-	}
-
-#undef copy_fs_field
-#undef copy_dev_field
-#undef copy_stripe_field
-#undef copy_field
-fsck_err:
-	bch2_dev_put(ca);
-	bch_err_fn(c, ret);
-	percpu_up_write(&c->mark_lock);
-	printbuf_exit(&buf);
-	return ret;
-}
-
-static int bch2_gc_start(struct bch_fs *c)
-{
-	BUG_ON(c->usage_gc);
-
-	c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
-					 sizeof(u64), GFP_KERNEL);
-	if (!c->usage_gc) {
-		bch_err(c, "error allocating c->usage_gc");
-		return -BCH_ERR_ENOMEM_gc_start;
-	}
-
-	for_each_member_device(c, ca) {
-		BUG_ON(ca->usage_gc);
-
-		ca->usage_gc = alloc_percpu(struct bch_dev_usage);
-		if (!ca->usage_gc) {
-			bch_err(c, "error allocating ca->usage_gc");
-			bch2_dev_put(ca);
-			return -BCH_ERR_ENOMEM_gc_start;
-		}
-
-		this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets,
-			       ca->mi.nbuckets - ca->mi.first_bucket);
-	}
-
-	return 0;
-}
-
-/* returns true if not equal */
-static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
-				     struct bch_alloc_v4 r)
-{
-	return  l.gen != r.gen				||
-		l.oldest_gen != r.oldest_gen		||
-		l.data_type != r.data_type		||
-		l.dirty_sectors	!= r.dirty_sectors	||
-		l.cached_sectors != r.cached_sectors	 ||
-		l.stripe_redundancy != r.stripe_redundancy ||
-		l.stripe != r.stripe;
-}
-
-static int bch2_alloc_write_key(struct btree_trans *trans,
-				struct btree_iter *iter,
-				struct bch_dev *ca,
-				struct bkey_s_c k)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_i_alloc_v4 *a;
-	struct bch_alloc_v4 old_gc, gc, old_convert, new;
-	const struct bch_alloc_v4 *old;
-	int ret;
-
-	old = bch2_alloc_to_v4(k, &old_convert);
-	gc = new = *old;
-
-	percpu_down_read(&c->mark_lock);
-	__bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset));
-
-	old_gc = gc;
-
-	if ((old->data_type == BCH_DATA_sb ||
-	     old->data_type == BCH_DATA_journal) &&
-	    !bch2_dev_is_online(ca)) {
-		gc.data_type = old->data_type;
-		gc.dirty_sectors = old->dirty_sectors;
-	}
-
-	/*
-	 * gc.data_type doesn't yet include need_discard & need_gc_gen states -
-	 * fix that here:
-	 */
-	alloc_data_type_set(&gc, gc.data_type);
-
-	if (gc.data_type != old_gc.data_type ||
-	    gc.dirty_sectors != old_gc.dirty_sectors)
-		bch2_dev_usage_update(c, ca, &old_gc, &gc, 0, true);
-	percpu_up_read(&c->mark_lock);
-
-	if (fsck_err_on(new.data_type != gc.data_type, c,
-			alloc_key_data_type_wrong,
-			"bucket %llu:%llu gen %u has wrong data_type"
-			": got %s, should be %s",
-			iter->pos.inode, iter->pos.offset,
-			gc.gen,
-			bch2_data_type_str(new.data_type),
-			bch2_data_type_str(gc.data_type)))
-		new.data_type = gc.data_type;
-
-#define copy_bucket_field(_errtype, _f)					\
-	if (fsck_err_on(new._f != gc._f, c, _errtype,			\
-			"bucket %llu:%llu gen %u data type %s has wrong " #_f	\
-			": got %u, should be %u",			\
-			iter->pos.inode, iter->pos.offset,		\
-			gc.gen,						\
-			bch2_data_type_str(gc.data_type),		\
-			new._f, gc._f))					\
-		new._f = gc._f;						\
-
-	copy_bucket_field(alloc_key_gen_wrong,
-			  gen);
-	copy_bucket_field(alloc_key_dirty_sectors_wrong,
-			  dirty_sectors);
-	copy_bucket_field(alloc_key_cached_sectors_wrong,
-			  cached_sectors);
-	copy_bucket_field(alloc_key_stripe_wrong,
-			  stripe);
-	copy_bucket_field(alloc_key_stripe_redundancy_wrong,
-			  stripe_redundancy);
-#undef copy_bucket_field
-
-	if (!bch2_alloc_v4_cmp(*old, new))
-		return 0;
-
-	a = bch2_alloc_to_v4_mut(trans, k);
-	ret = PTR_ERR_OR_ZERO(a);
-	if (ret)
-		return ret;
-
-	a->v = new;
-
-	/*
-	 * The trigger normally makes sure this is set, but we're not running
-	 * triggers:
-	 */
-	if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
-		a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
-
-	ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_norun);
-fsck_err:
-	return ret;
-}
-
-static int bch2_gc_alloc_done(struct bch_fs *c)
-{
-	int ret = 0;
-
-	for_each_member_device(c, ca) {
-		ret = bch2_trans_run(c,
-			for_each_btree_key_upto_commit(trans, iter, BTREE_ID_alloc,
-					POS(ca->dev_idx, ca->mi.first_bucket),
-					POS(ca->dev_idx, ca->mi.nbuckets - 1),
-					BTREE_ITER_slots|BTREE_ITER_prefetch, k,
-					NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
-				bch2_alloc_write_key(trans, &iter, ca, k)));
-		if (ret) {
-			bch2_dev_put(ca);
-			break;
-		}
-	}
-
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-static int bch2_gc_alloc_start(struct bch_fs *c)
-{
-	for_each_member_device(c, ca) {
-		struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) +
-				ca->mi.nbuckets * sizeof(struct bucket),
-				GFP_KERNEL|__GFP_ZERO);
-		if (!buckets) {
-			bch2_dev_put(ca);
-			bch_err(c, "error allocating ca->buckets[gc]");
-			return -BCH_ERR_ENOMEM_gc_alloc_start;
-		}
-
-		buckets->first_bucket	= ca->mi.first_bucket;
-		buckets->nbuckets	= ca->mi.nbuckets;
-		rcu_assign_pointer(ca->buckets_gc, buckets);
-	}
-
-	struct bch_dev *ca = NULL;
-	int ret = bch2_trans_run(c,
-		for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
-					 BTREE_ITER_prefetch, k, ({
-			ca = bch2_dev_iterate(c, ca, k.k->p.inode);
-			if (!ca) {
-				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
-				continue;
-			}
-
-			struct bch_alloc_v4 a_convert;
-			const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
-
-			struct bucket *g = gc_bucket(ca, k.k->p.offset);
-			g->gen_valid	= 1;
-			g->gen		= a->gen;
-			0;
-		})));
-	bch2_dev_put(ca);
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-static int bch2_gc_write_reflink_key(struct btree_trans *trans,
-				     struct btree_iter *iter,
-				     struct bkey_s_c k,
-				     size_t *idx)
-{
-	struct bch_fs *c = trans->c;
-	const __le64 *refcount = bkey_refcount_c(k);
-	struct printbuf buf = PRINTBUF;
-	struct reflink_gc *r;
-	int ret = 0;
-
-	if (!refcount)
-		return 0;
-
-	while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) &&
-	       r->offset < k.k->p.offset)
-		++*idx;
-
-	if (!r ||
-	    r->offset != k.k->p.offset ||
-	    r->size != k.k->size) {
-		bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
-		return -EINVAL;
-	}
-
-	if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
-			reflink_v_refcount_wrong,
-			"reflink key has wrong refcount:\n"
-			"  %s\n"
-			"  should be %u",
-			(bch2_bkey_val_to_text(&buf, c, k), buf.buf),
-			r->refcount)) {
-		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
-		ret = PTR_ERR_OR_ZERO(new);
-		if (ret)
-			goto out;
-
-		if (!r->refcount)
-			new->k.type = KEY_TYPE_deleted;
-		else
-			*bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
-		ret = bch2_trans_update(trans, iter, new, 0);
-	}
-out:
-fsck_err:
-	printbuf_exit(&buf);
-	return ret;
-}
-
-static int bch2_gc_reflink_done(struct bch_fs *c)
-{
-	size_t idx = 0;
-
-	int ret = bch2_trans_run(c,
-		for_each_btree_key_commit(trans, iter,
-				BTREE_ID_reflink, POS_MIN,
-				BTREE_ITER_prefetch, k,
-				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-			bch2_gc_write_reflink_key(trans, &iter, k, &idx)));
-	c->reflink_gc_nr = 0;
-	return ret;
-}
-
-static int bch2_gc_reflink_start(struct bch_fs *c)
-{
-	c->reflink_gc_nr = 0;
-
-	int ret = bch2_trans_run(c,
-		for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
-				   BTREE_ITER_prefetch, k, ({
-			const __le64 *refcount = bkey_refcount_c(k);
-
-			if (!refcount)
-				continue;
-
-			struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table,
-							c->reflink_gc_nr++, GFP_KERNEL);
-			if (!r) {
-				ret = -BCH_ERR_ENOMEM_gc_reflink_start;
-				break;
-			}
-
-			r->offset	= k.k->p.offset;
-			r->size		= k.k->size;
-			r->refcount	= 0;
-			0;
-		})));
-
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-static int bch2_gc_write_stripes_key(struct btree_trans *trans,
-				     struct btree_iter *iter,
-				     struct bkey_s_c k)
-{
-	struct bch_fs *c = trans->c;
-	struct printbuf buf = PRINTBUF;
-	const struct bch_stripe *s;
-	struct gc_stripe *m;
-	bool bad = false;
-	unsigned i;
-	int ret = 0;
-
-	if (k.k->type != KEY_TYPE_stripe)
-		return 0;
-
-	s = bkey_s_c_to_stripe(k).v;
-	m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
-
-	for (i = 0; i < s->nr_blocks; i++) {
-		u32 old = stripe_blockcount_get(s, i);
-		u32 new = (m ? m->block_sectors[i] : 0);
-
-		if (old != new) {
-			prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n",
-				   i, old, new);
-			bad = true;
-		}
-	}
-
-	if (bad)
-		bch2_bkey_val_to_text(&buf, c, k);
-
-	if (fsck_err_on(bad, c, stripe_sector_count_wrong,
-			"%s", buf.buf)) {
-		struct bkey_i_stripe *new;
-
-		new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
-		ret = PTR_ERR_OR_ZERO(new);
-		if (ret)
-			return ret;
-
-		bkey_reassemble(&new->k_i, k);
-
-		for (i = 0; i < new->v.nr_blocks; i++)
-			stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
-
-		ret = bch2_trans_update(trans, iter, &new->k_i, 0);
-	}
-fsck_err:
-	printbuf_exit(&buf);
-	return ret;
-}
-
-static int bch2_gc_stripes_done(struct bch_fs *c)
-{
-	return bch2_trans_run(c,
-		for_each_btree_key_commit(trans, iter,
-				BTREE_ID_stripes, POS_MIN,
-				BTREE_ITER_prefetch, k,
-				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-			bch2_gc_write_stripes_key(trans, &iter, k)));
-}
-
-/**
- * bch2_check_allocations - walk all references to buckets, and recompute them:
- *
- * @c:			filesystem object
- *
- * Returns: 0 on success, or standard errcode on failure
- *
- * Order matters here:
- *  - Concurrent GC relies on the fact that we have a total ordering for
- *    everything that GC walks - see  gc_will_visit_node(),
- *    gc_will_visit_root()
- *
- *  - also, references move around in the course of index updates and
- *    various other crap: everything needs to agree on the ordering
- *    references are allowed to move around in - e.g., we're allowed to
- *    start with a reference owned by an open_bucket (the allocator) and
- *    move it to the btree, but not the reverse.
- *
- *    This is necessary to ensure that gc doesn't miss references that
- *    move around - if references move backwards in the ordering GC
- *    uses, GC could skip past them
- */
-int bch2_check_allocations(struct bch_fs *c)
-{
-	int ret;
-
-	lockdep_assert_held(&c->state_lock);
-
-	down_write(&c->gc_lock);
-
-	bch2_btree_interior_updates_flush(c);
-
-	ret   = bch2_gc_start(c) ?:
-		bch2_gc_alloc_start(c) ?:
-		bch2_gc_reflink_start(c);
-	if (ret)
-		goto out;
-
-	gc_pos_set(c, gc_phase(GC_PHASE_START));
-
-	ret = bch2_mark_superblocks(c);
-	BUG_ON(ret);
-
-	ret = bch2_gc_btrees(c);
-	if (ret)
-		goto out;
-
-	c->gc_count++;
-
-	bch2_journal_block(&c->journal);
-out:
-	ret   = bch2_gc_alloc_done(c) ?:
-		bch2_gc_done(c) ?:
-		bch2_gc_stripes_done(c) ?:
-		bch2_gc_reflink_done(c);
-
-	bch2_journal_unblock(&c->journal);
-
-	percpu_down_write(&c->mark_lock);
-	/* Indicates that gc is no longer in progress: */
-	__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
-
-	bch2_gc_free(c);
-	percpu_up_write(&c->mark_lock);
-
-	up_write(&c->gc_lock);
-
-	/*
-	 * At startup, allocations can happen directly instead of via the
-	 * allocator thread - issue wakeup in case they blocked on gc_lock:
-	 */
-	closure_wake_up(&c->freelist_wait);
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-static int gc_btree_gens_key(struct btree_trans *trans,
-			     struct btree_iter *iter,
-			     struct bkey_s_c k)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	struct bkey_i *u;
-	int ret;
-
-	if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
-		return -EROFS;
-
-	percpu_down_read(&c->mark_lock);
-	rcu_read_lock();
-	bkey_for_each_ptr(ptrs, ptr) {
-		struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
-		if (!ca)
-			continue;
-
-		if (dev_ptr_stale(ca, ptr) > 16) {
-			rcu_read_unlock();
-			percpu_up_read(&c->mark_lock);
-			goto update;
-		}
-	}
-
-	bkey_for_each_ptr(ptrs, ptr) {
-		struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
-		if (!ca)
-			continue;
-
-		u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
-		if (gen_after(*gen, ptr->gen))
-			*gen = ptr->gen;
-	}
-	rcu_read_unlock();
-	percpu_up_read(&c->mark_lock);
-	return 0;
-update:
-	u = bch2_bkey_make_mut(trans, iter, &k, 0);
-	ret = PTR_ERR_OR_ZERO(u);
-	if (ret)
-		return ret;
-
-	bch2_extent_normalize(c, bkey_i_to_s(u));
-	return 0;
-}
-
-static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev *ca,
-				       struct btree_iter *iter, struct bkey_s_c k)
-{
-	struct bch_alloc_v4 a_convert;
-	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
-	struct bkey_i_alloc_v4 *a_mut;
-	int ret;
-
-	if (a->oldest_gen == ca->oldest_gen[iter->pos.offset])
-		return 0;
-
-	a_mut = bch2_alloc_to_v4_mut(trans, k);
-	ret = PTR_ERR_OR_ZERO(a_mut);
-	if (ret)
-		return ret;
-
-	a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
-	alloc_data_type_set(&a_mut->v, a_mut->v.data_type);
-
-	return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
-}
-
-int bch2_gc_gens(struct bch_fs *c)
-{
-	u64 b, start_time = local_clock();
-	int ret;
-
-	/*
-	 * Ideally we would be using state_lock and not gc_lock here, but that
-	 * introduces a deadlock in the RO path - we currently take the state
-	 * lock at the start of going RO, thus the gc thread may get stuck:
-	 */
-	if (!mutex_trylock(&c->gc_gens_lock))
-		return 0;
-
-	trace_and_count(c, gc_gens_start, c);
-	down_read(&c->gc_lock);
-
-	for_each_member_device(c, ca) {
-		struct bucket_gens *gens = bucket_gens(ca);
-
-		BUG_ON(ca->oldest_gen);
-
-		ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
-		if (!ca->oldest_gen) {
-			bch2_dev_put(ca);
-			ret = -BCH_ERR_ENOMEM_gc_gens;
-			goto err;
-		}
-
-		for (b = gens->first_bucket;
-		     b < gens->nbuckets; b++)
-			ca->oldest_gen[b] = gens->b[b];
-	}
-
-	for (unsigned i = 0; i < BTREE_ID_NR; i++)
-		if (btree_type_has_ptrs(i)) {
-			c->gc_gens_btree = i;
-			c->gc_gens_pos = POS_MIN;
-
-			ret = bch2_trans_run(c,
-				for_each_btree_key_commit(trans, iter, i,
-						POS_MIN,
-						BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
-						k,
-						NULL, NULL,
-						BCH_TRANS_COMMIT_no_enospc,
-					gc_btree_gens_key(trans, &iter, k)));
-			if (ret)
-				goto err;
-		}
-
-	struct bch_dev *ca = NULL;
-	ret = bch2_trans_run(c,
-		for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
-				POS_MIN,
-				BTREE_ITER_prefetch,
-				k,
-				NULL, NULL,
-				BCH_TRANS_COMMIT_no_enospc, ({
-			ca = bch2_dev_iterate(c, ca, k.k->p.inode);
-			if (!ca) {
-				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
-				continue;
-			}
-			bch2_alloc_write_oldest_gen(trans, ca, &iter, k);
-		})));
-	bch2_dev_put(ca);
-
-	if (ret)
-		goto err;
-
-	c->gc_gens_btree	= 0;
-	c->gc_gens_pos		= POS_MIN;
-
-	c->gc_count++;
-
-	bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
-	trace_and_count(c, gc_gens_end, c);
-err:
-	for_each_member_device(c, ca) {
-		kvfree(ca->oldest_gen);
-		ca->oldest_gen = NULL;
-	}
-
-	up_read(&c->gc_lock);
-	mutex_unlock(&c->gc_gens_lock);
-	if (!bch2_err_matches(ret, EROFS))
-		bch_err_fn(c, ret);
-	return ret;
-}
-
-static void bch2_gc_gens_work(struct work_struct *work)
-{
-	struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work);
-	bch2_gc_gens(c);
-	bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
-}
-
-void bch2_gc_gens_async(struct bch_fs *c)
-{
-	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_gc_gens) &&
-	    !queue_work(c->write_ref_wq, &c->gc_gens_work))
-		bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
-}
-
-void bch2_fs_gc_init(struct bch_fs *c)
-{
-	seqcount_init(&c->gc_pos_lock);
-
-	INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work);
-}
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
deleted file mode 100644
index 1b6489d8e0f4..000000000000
--- a/fs/bcachefs/btree_gc.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_GC_H
-#define _BCACHEFS_BTREE_GC_H
-
-#include "bkey.h"
-#include "btree_types.h"
-
-int bch2_check_topology(struct bch_fs *);
-int bch2_check_allocations(struct bch_fs *);
-
-/*
- * For concurrent mark and sweep (with other index updates), we define a total
- * ordering of _all_ references GC walks:
- *
- * Note that some references will have the same GC position as others - e.g.
- * everything within the same btree node; in those cases we're relying on
- * whatever locking exists for where those references live, i.e. the write lock
- * on a btree node.
- *
- * That locking is also required to ensure GC doesn't pass the updater in
- * between the updater adding/removing the reference and updating the GC marks;
- * without that, we would at best double count sometimes.
- *
- * That part is important - whenever calling bch2_mark_pointers(), a lock _must_
- * be held that prevents GC from passing the position the updater is at.
- *
- * (What about the start of gc, when we're clearing all the marks? GC clears the
- * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc
- * position inside its cmpxchg loop, so crap magically works).
- */
-
-/* Position of (the start of) a gc phase: */
-static inline struct gc_pos gc_phase(enum gc_phase phase)
-{
-	return (struct gc_pos) {
-		.phase	= phase,
-		.level	= 0,
-		.pos	= POS_MIN,
-	};
-}
-
-static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
-{
-	return   cmp_int(l.phase, r.phase) ?:
-		-cmp_int(l.level, r.level) ?:
-		 bpos_cmp(l.pos, r.pos);
-}
-
-static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id)
-{
-	switch (id) {
-#define x(name, v, ...) case BTREE_ID_##name: return GC_PHASE_BTREE_##name;
-	BCH_BTREE_IDS()
-#undef x
-	default:
-		BUG();
-	}
-}
-
-static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level,
-					 struct bpos pos)
-{
-	return (struct gc_pos) {
-		.phase	= btree_id_to_gc_phase(btree),
-		.level	= level,
-		.pos	= pos,
-	};
-}
-
-/*
- * GC position of the pointers within a btree node: note, _not_ for &b->key
- * itself, that lives in the parent node:
- */
-static inline struct gc_pos gc_pos_btree_node(struct btree *b)
-{
-	return gc_pos_btree(b->c.btree_id, b->c.level, b->key.k.p);
-}
-
-static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
-{
-	unsigned seq;
-	bool ret;
-
-	do {
-		seq = read_seqcount_begin(&c->gc_pos_lock);
-		ret = gc_pos_cmp(pos, c->gc_pos) <= 0;
-	} while (read_seqcount_retry(&c->gc_pos_lock, seq));
-
-	return ret;
-}
-
-int bch2_gc_gens(struct bch_fs *);
-void bch2_gc_gens_async(struct bch_fs *);
-void bch2_fs_gc_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_GC_H */
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
deleted file mode 100644
index cbf8f5d90602..000000000000
--- a/fs/bcachefs/btree_io.c
+++ /dev/null
@@ -1,2373 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_methods.h"
-#include "bkey_sort.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "debug.h"
-#include "error.h"
-#include "extents.h"
-#include "io_write.h"
-#include "journal_reclaim.h"
-#include "journal_seq_blacklist.h"
-#include "recovery.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/sched/mm.h>
-
-static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn)
-{
-	prt_printf(out, "btree=%s l=%u seq %llux\n",
-		   bch2_btree_id_str(BTREE_NODE_ID(bn)),
-		   (unsigned) BTREE_NODE_LEVEL(bn), bn->keys.seq);
-	prt_str(out, "min: ");
-	bch2_bpos_to_text(out, bn->min_key);
-	prt_newline(out);
-	prt_str(out, "max: ");
-	bch2_bpos_to_text(out, bn->max_key);
-}
-
-void bch2_btree_node_io_unlock(struct btree *b)
-{
-	EBUG_ON(!btree_node_write_in_flight(b));
-
-	clear_btree_node_write_in_flight_inner(b);
-	clear_btree_node_write_in_flight(b);
-	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
-}
-
-void bch2_btree_node_io_lock(struct btree *b)
-{
-	bch2_assert_btree_nodes_not_locked();
-
-	wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
-			    TASK_UNINTERRUPTIBLE);
-}
-
-void __bch2_btree_node_wait_on_read(struct btree *b)
-{
-	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
-		       TASK_UNINTERRUPTIBLE);
-}
-
-void __bch2_btree_node_wait_on_write(struct btree *b)
-{
-	wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
-		       TASK_UNINTERRUPTIBLE);
-}
-
-void bch2_btree_node_wait_on_read(struct btree *b)
-{
-	bch2_assert_btree_nodes_not_locked();
-
-	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
-		       TASK_UNINTERRUPTIBLE);
-}
-
-void bch2_btree_node_wait_on_write(struct btree *b)
-{
-	bch2_assert_btree_nodes_not_locked();
-
-	wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
-		       TASK_UNINTERRUPTIBLE);
-}
-
-static void verify_no_dups(struct btree *b,
-			   struct bkey_packed *start,
-			   struct bkey_packed *end)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
-	struct bkey_packed *k, *p;
-
-	if (start == end)
-		return;
-
-	for (p = start, k = bkey_p_next(start);
-	     k != end;
-	     p = k, k = bkey_p_next(k)) {
-		struct bkey l = bkey_unpack_key(b, p);
-		struct bkey r = bkey_unpack_key(b, k);
-
-		BUG_ON(bpos_ge(l.p, bkey_start_pos(&r)));
-	}
-#endif
-}
-
-static void set_needs_whiteout(struct bset *i, int v)
-{
-	struct bkey_packed *k;
-
-	for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
-		k->needs_whiteout = v;
-}
-
-static void btree_bounce_free(struct bch_fs *c, size_t size,
-			      bool used_mempool, void *p)
-{
-	if (used_mempool)
-		mempool_free(p, &c->btree_bounce_pool);
-	else
-		kvfree(p);
-}
-
-static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
-				bool *used_mempool)
-{
-	unsigned flags = memalloc_nofs_save();
-	void *p;
-
-	BUG_ON(size > c->opts.btree_node_size);
-
-	*used_mempool = false;
-	p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
-	if (!p) {
-		*used_mempool = true;
-		p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
-	}
-	memalloc_nofs_restore(flags);
-	return p;
-}
-
-static void sort_bkey_ptrs(const struct btree *bt,
-			   struct bkey_packed **ptrs, unsigned nr)
-{
-	unsigned n = nr, a = nr / 2, b, c, d;
-
-	if (!a)
-		return;
-
-	/* Heap sort: see lib/sort.c: */
-	while (1) {
-		if (a)
-			a--;
-		else if (--n)
-			swap(ptrs[0], ptrs[n]);
-		else
-			break;
-
-		for (b = a; c = 2 * b + 1, (d = c + 1) < n;)
-			b = bch2_bkey_cmp_packed(bt,
-					    ptrs[c],
-					    ptrs[d]) >= 0 ? c : d;
-		if (d == n)
-			b = c;
-
-		while (b != a &&
-		       bch2_bkey_cmp_packed(bt,
-				       ptrs[a],
-				       ptrs[b]) >= 0)
-			b = (b - 1) / 2;
-		c = b;
-		while (b != a) {
-			b = (b - 1) / 2;
-			swap(ptrs[b], ptrs[c]);
-		}
-	}
-}
-
-static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
-{
-	struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k;
-	bool used_mempool = false;
-	size_t bytes = b->whiteout_u64s * sizeof(u64);
-
-	if (!b->whiteout_u64s)
-		return;
-
-	new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool);
-
-	ptrs = ptrs_end = ((void *) new_whiteouts + bytes);
-
-	for (k = unwritten_whiteouts_start(b);
-	     k != unwritten_whiteouts_end(b);
-	     k = bkey_p_next(k))
-		*--ptrs = k;
-
-	sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs);
-
-	k = new_whiteouts;
-
-	while (ptrs != ptrs_end) {
-		bkey_p_copy(k, *ptrs);
-		k = bkey_p_next(k);
-		ptrs++;
-	}
-
-	verify_no_dups(b, new_whiteouts,
-		       (void *) ((u64 *) new_whiteouts + b->whiteout_u64s));
-
-	memcpy_u64s(unwritten_whiteouts_start(b),
-		    new_whiteouts, b->whiteout_u64s);
-
-	btree_bounce_free(c, bytes, used_mempool, new_whiteouts);
-}
-
-static bool should_compact_bset(struct btree *b, struct bset_tree *t,
-				bool compacting, enum compact_mode mode)
-{
-	if (!bset_dead_u64s(b, t))
-		return false;
-
-	switch (mode) {
-	case COMPACT_LAZY:
-		return should_compact_bset_lazy(b, t) ||
-			(compacting && !bset_written(b, bset(b, t)));
-	case COMPACT_ALL:
-		return true;
-	default:
-		BUG();
-	}
-}
-
-static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
-{
-	bool ret = false;
-
-	for_each_bset(b, t) {
-		struct bset *i = bset(b, t);
-		struct bkey_packed *k, *n, *out, *start, *end;
-		struct btree_node_entry *src = NULL, *dst = NULL;
-
-		if (t != b->set && !bset_written(b, i)) {
-			src = container_of(i, struct btree_node_entry, keys);
-			dst = max(write_block(b),
-				  (void *) btree_bkey_last(b, t - 1));
-		}
-
-		if (src != dst)
-			ret = true;
-
-		if (!should_compact_bset(b, t, ret, mode)) {
-			if (src != dst) {
-				memmove(dst, src, sizeof(*src) +
-					le16_to_cpu(src->keys.u64s) *
-					sizeof(u64));
-				i = &dst->keys;
-				set_btree_bset(b, t, i);
-			}
-			continue;
-		}
-
-		start	= btree_bkey_first(b, t);
-		end	= btree_bkey_last(b, t);
-
-		if (src != dst) {
-			memmove(dst, src, sizeof(*src));
-			i = &dst->keys;
-			set_btree_bset(b, t, i);
-		}
-
-		out = i->start;
-
-		for (k = start; k != end; k = n) {
-			n = bkey_p_next(k);
-
-			if (!bkey_deleted(k)) {
-				bkey_p_copy(out, k);
-				out = bkey_p_next(out);
-			} else {
-				BUG_ON(k->needs_whiteout);
-			}
-		}
-
-		i->u64s = cpu_to_le16((u64 *) out - i->_data);
-		set_btree_bset_end(b, t);
-		bch2_bset_set_no_aux_tree(b, t);
-		ret = true;
-	}
-
-	bch2_verify_btree_nr_keys(b);
-
-	bch2_btree_build_aux_trees(b);
-
-	return ret;
-}
-
-bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
-			    enum compact_mode mode)
-{
-	return bch2_drop_whiteouts(b, mode);
-}
-
-static void btree_node_sort(struct bch_fs *c, struct btree *b,
-			    unsigned start_idx,
-			    unsigned end_idx)
-{
-	struct btree_node *out;
-	struct sort_iter_stack sort_iter;
-	struct bset_tree *t;
-	struct bset *start_bset = bset(b, &b->set[start_idx]);
-	bool used_mempool = false;
-	u64 start_time, seq = 0;
-	unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1;
-	bool sorting_entire_node = start_idx == 0 &&
-		end_idx == b->nsets;
-
-	sort_iter_stack_init(&sort_iter, b);
-
-	for (t = b->set + start_idx;
-	     t < b->set + end_idx;
-	     t++) {
-		u64s += le16_to_cpu(bset(b, t)->u64s);
-		sort_iter_add(&sort_iter.iter,
-			      btree_bkey_first(b, t),
-			      btree_bkey_last(b, t));
-	}
-
-	bytes = sorting_entire_node
-		? btree_buf_bytes(b)
-		: __vstruct_bytes(struct btree_node, u64s);
-
-	out = btree_bounce_alloc(c, bytes, &used_mempool);
-
-	start_time = local_clock();
-
-	u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter);
-
-	out->keys.u64s = cpu_to_le16(u64s);
-
-	BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes);
-
-	if (sorting_entire_node)
-		bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
-				       start_time);
-
-	/* Make sure we preserve bset journal_seq: */
-	for (t = b->set + start_idx; t < b->set + end_idx; t++)
-		seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
-	start_bset->journal_seq = cpu_to_le64(seq);
-
-	if (sorting_entire_node) {
-		u64s = le16_to_cpu(out->keys.u64s);
-
-		BUG_ON(bytes != btree_buf_bytes(b));
-
-		/*
-		 * Our temporary buffer is the same size as the btree node's
-		 * buffer, we can just swap buffers instead of doing a big
-		 * memcpy()
-		 */
-		*out = *b->data;
-		out->keys.u64s = cpu_to_le16(u64s);
-		swap(out, b->data);
-		set_btree_bset(b, b->set, &b->data->keys);
-	} else {
-		start_bset->u64s = out->keys.u64s;
-		memcpy_u64s(start_bset->start,
-			    out->keys.start,
-			    le16_to_cpu(out->keys.u64s));
-	}
-
-	for (i = start_idx + 1; i < end_idx; i++)
-		b->nr.bset_u64s[start_idx] +=
-			b->nr.bset_u64s[i];
-
-	b->nsets -= shift;
-
-	for (i = start_idx + 1; i < b->nsets; i++) {
-		b->nr.bset_u64s[i]	= b->nr.bset_u64s[i + shift];
-		b->set[i]		= b->set[i + shift];
-	}
-
-	for (i = b->nsets; i < MAX_BSETS; i++)
-		b->nr.bset_u64s[i] = 0;
-
-	set_btree_bset_end(b, &b->set[start_idx]);
-	bch2_bset_set_no_aux_tree(b, &b->set[start_idx]);
-
-	btree_bounce_free(c, bytes, used_mempool, out);
-
-	bch2_verify_btree_nr_keys(b);
-}
-
-void bch2_btree_sort_into(struct bch_fs *c,
-			 struct btree *dst,
-			 struct btree *src)
-{
-	struct btree_nr_keys nr;
-	struct btree_node_iter src_iter;
-	u64 start_time = local_clock();
-
-	BUG_ON(dst->nsets != 1);
-
-	bch2_bset_set_no_aux_tree(dst, dst->set);
-
-	bch2_btree_node_iter_init_from_start(&src_iter, src);
-
-	nr = bch2_sort_repack(btree_bset_first(dst),
-			src, &src_iter,
-			&dst->format,
-			true);
-
-	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
-			       start_time);
-
-	set_btree_bset_end(dst, dst->set);
-
-	dst->nr.live_u64s	+= nr.live_u64s;
-	dst->nr.bset_u64s[0]	+= nr.bset_u64s[0];
-	dst->nr.packed_keys	+= nr.packed_keys;
-	dst->nr.unpacked_keys	+= nr.unpacked_keys;
-
-	bch2_verify_btree_nr_keys(dst);
-}
-
-/*
- * We're about to add another bset to the btree node, so if there's currently
- * too many bsets - sort some of them together:
- */
-static bool btree_node_compact(struct bch_fs *c, struct btree *b)
-{
-	unsigned unwritten_idx;
-	bool ret = false;
-
-	for (unwritten_idx = 0;
-	     unwritten_idx < b->nsets;
-	     unwritten_idx++)
-		if (!bset_written(b, bset(b, &b->set[unwritten_idx])))
-			break;
-
-	if (b->nsets - unwritten_idx > 1) {
-		btree_node_sort(c, b, unwritten_idx, b->nsets);
-		ret = true;
-	}
-
-	if (unwritten_idx > 1) {
-		btree_node_sort(c, b, 0, unwritten_idx);
-		ret = true;
-	}
-
-	return ret;
-}
-
-void bch2_btree_build_aux_trees(struct btree *b)
-{
-	for_each_bset(b, t)
-		bch2_bset_build_aux_tree(b, t,
-				!bset_written(b, bset(b, t)) &&
-				t == bset_tree_last(b));
-}
-
-/*
- * If we have MAX_BSETS (3) bsets, should we sort them all down to just one?
- *
- * The first bset is going to be of similar order to the size of the node, the
- * last bset is bounded by btree_write_set_buffer(), which is set to keep the
- * memmove on insert from being too expensive: the middle bset should, ideally,
- * be the geometric mean of the first and the last.
- *
- * Returns true if the middle bset is greater than that geometric mean:
- */
-static inline bool should_compact_all(struct bch_fs *c, struct btree *b)
-{
-	unsigned mid_u64s_bits =
-		(ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2;
-
-	return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits;
-}
-
-/*
- * @bch_btree_init_next - initialize a new (unwritten) bset that can then be
- * inserted into
- *
- * Safe to call if there already is an unwritten bset - will only add a new bset
- * if @b doesn't already have one.
- *
- * Returns true if we sorted (i.e. invalidated iterators
- */
-void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_node_entry *bne;
-	bool reinit_iter = false;
-
-	EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]);
-	BUG_ON(bset_written(b, bset(b, &b->set[1])));
-	BUG_ON(btree_node_just_written(b));
-
-	if (b->nsets == MAX_BSETS &&
-	    !btree_node_write_in_flight(b) &&
-	    should_compact_all(c, b)) {
-		bch2_btree_node_write(c, b, SIX_LOCK_write,
-				      BTREE_WRITE_init_next_bset);
-		reinit_iter = true;
-	}
-
-	if (b->nsets == MAX_BSETS &&
-	    btree_node_compact(c, b))
-		reinit_iter = true;
-
-	BUG_ON(b->nsets >= MAX_BSETS);
-
-	bne = want_new_bset(c, b);
-	if (bne)
-		bch2_bset_init_next(b, bne);
-
-	bch2_btree_build_aux_trees(b);
-
-	if (reinit_iter)
-		bch2_trans_node_reinit_iter(trans, b);
-}
-
-static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
-			  struct bch_dev *ca,
-			  struct btree *b, struct bset *i,
-			  unsigned offset, int write)
-{
-	prt_printf(out, bch2_log_msg(c, "%s"),
-		   write == READ
-		   ? "error validating btree node "
-		   : "corrupt btree node before write ");
-	if (ca)
-		prt_printf(out, "on %s ", ca->name);
-	prt_printf(out, "at btree ");
-	bch2_btree_pos_to_text(out, c, b);
-
-	printbuf_indent_add(out, 2);
-
-	prt_printf(out, "\nnode offset %u/%u",
-		   b->written, btree_ptr_sectors_written(&b->key));
-	if (i)
-		prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
-	prt_str(out, ": ");
-}
-
-__printf(9, 10)
-static int __btree_err(int ret,
-		       struct bch_fs *c,
-		       struct bch_dev *ca,
-		       struct btree *b,
-		       struct bset *i,
-		       int write,
-		       bool have_retry,
-		       enum bch_sb_error_id err_type,
-		       const char *fmt, ...)
-{
-	struct printbuf out = PRINTBUF;
-	bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes;
-	va_list args;
-
-	btree_err_msg(&out, c, ca, b, i, b->written, write);
-
-	va_start(args, fmt);
-	prt_vprintf(&out, fmt, args);
-	va_end(args);
-
-	if (write == WRITE) {
-		bch2_print_string_as_lines(KERN_ERR, out.buf);
-		ret = c->opts.errors == BCH_ON_ERROR_continue
-			? 0
-			: -BCH_ERR_fsck_errors_not_fixed;
-		goto out;
-	}
-
-	if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
-		ret = -BCH_ERR_btree_node_read_err_fixable;
-	if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
-		ret = -BCH_ERR_btree_node_read_err_bad_node;
-
-	if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable)
-		bch2_sb_error_count(c, err_type);
-
-	switch (ret) {
-	case -BCH_ERR_btree_node_read_err_fixable:
-		ret = !silent
-			? bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf)
-			: -BCH_ERR_fsck_fix;
-		if (ret != -BCH_ERR_fsck_fix &&
-		    ret != -BCH_ERR_fsck_ignore)
-			goto fsck_err;
-		ret = -BCH_ERR_fsck_fix;
-		break;
-	case -BCH_ERR_btree_node_read_err_want_retry:
-	case -BCH_ERR_btree_node_read_err_must_retry:
-		if (!silent)
-			bch2_print_string_as_lines(KERN_ERR, out.buf);
-		break;
-	case -BCH_ERR_btree_node_read_err_bad_node:
-		if (!silent)
-			bch2_print_string_as_lines(KERN_ERR, out.buf);
-		ret = bch2_topology_error(c);
-		break;
-	case -BCH_ERR_btree_node_read_err_incompatible:
-		if (!silent)
-			bch2_print_string_as_lines(KERN_ERR, out.buf);
-		ret = -BCH_ERR_fsck_errors_not_fixed;
-		break;
-	default:
-		BUG();
-	}
-out:
-fsck_err:
-	printbuf_exit(&out);
-	return ret;
-}
-
-#define btree_err(type, c, ca, b, i, _err_type, msg, ...)		\
-({									\
-	int _ret = __btree_err(type, c, ca, b, i, write, have_retry,	\
-			       BCH_FSCK_ERR_##_err_type,		\
-			       msg, ##__VA_ARGS__);			\
-									\
-	if (_ret != -BCH_ERR_fsck_fix) {				\
-		ret = _ret;						\
-		goto fsck_err;						\
-	}								\
-									\
-	*saw_error = true;						\
-})
-
-#define btree_err_on(cond, ...)	((cond) ? btree_err(__VA_ARGS__) : false)
-
-/*
- * When btree topology repair changes the start or end of a node, that might
- * mean we have to drop keys that are no longer inside the node:
- */
-__cold
-void bch2_btree_node_drop_keys_outside_node(struct btree *b)
-{
-	for_each_bset(b, t) {
-		struct bset *i = bset(b, t);
-		struct bkey_packed *k;
-
-		for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
-			if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0)
-				break;
-
-		if (k != i->start) {
-			unsigned shift = (u64 *) k - (u64 *) i->start;
-
-			memmove_u64s_down(i->start, k,
-					  (u64 *) vstruct_end(i) - (u64 *) k);
-			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift);
-			set_btree_bset_end(b, t);
-		}
-
-		for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
-			if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0)
-				break;
-
-		if (k != vstruct_last(i)) {
-			i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start);
-			set_btree_bset_end(b, t);
-		}
-	}
-
-	/*
-	 * Always rebuild search trees: eytzinger search tree nodes directly
-	 * depend on the values of min/max key:
-	 */
-	bch2_bset_set_no_aux_tree(b, b->set);
-	bch2_btree_build_aux_trees(b);
-	b->nr = bch2_btree_node_count_keys(b);
-
-	struct bkey_s_c k;
-	struct bkey unpacked;
-	struct btree_node_iter iter;
-	for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
-		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
-		BUG_ON(bpos_gt(k.k->p, b->data->max_key));
-	}
-}
-
-static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
-			 struct btree *b, struct bset *i,
-			 unsigned offset, unsigned sectors,
-			 int write, bool have_retry, bool *saw_error)
-{
-	unsigned version = le16_to_cpu(i->version);
-	struct printbuf buf1 = PRINTBUF;
-	struct printbuf buf2 = PRINTBUF;
-	int ret = 0;
-
-	btree_err_on(!bch2_version_compatible(version),
-		     -BCH_ERR_btree_node_read_err_incompatible,
-		     c, ca, b, i,
-		     btree_node_unsupported_version,
-		     "unsupported bset version %u.%u",
-		     BCH_VERSION_MAJOR(version),
-		     BCH_VERSION_MINOR(version));
-
-	if (btree_err_on(version < c->sb.version_min,
-			 -BCH_ERR_btree_node_read_err_fixable,
-			 c, NULL, b, i,
-			 btree_node_bset_older_than_sb_min,
-			 "bset version %u older than superblock version_min %u",
-			 version, c->sb.version_min)) {
-		mutex_lock(&c->sb_lock);
-		c->disk_sb.sb->version_min = cpu_to_le16(version);
-		bch2_write_super(c);
-		mutex_unlock(&c->sb_lock);
-	}
-
-	if (btree_err_on(BCH_VERSION_MAJOR(version) >
-			 BCH_VERSION_MAJOR(c->sb.version),
-			 -BCH_ERR_btree_node_read_err_fixable,
-			 c, NULL, b, i,
-			 btree_node_bset_newer_than_sb,
-			 "bset version %u newer than superblock version %u",
-			 version, c->sb.version)) {
-		mutex_lock(&c->sb_lock);
-		c->disk_sb.sb->version = cpu_to_le16(version);
-		bch2_write_super(c);
-		mutex_unlock(&c->sb_lock);
-	}
-
-	btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
-		     -BCH_ERR_btree_node_read_err_incompatible,
-		     c, ca, b, i,
-		     btree_node_unsupported_version,
-		     "BSET_SEPARATE_WHITEOUTS no longer supported");
-
-	if (btree_err_on(offset + sectors > btree_sectors(c),
-			 -BCH_ERR_btree_node_read_err_fixable,
-			 c, ca, b, i,
-			 bset_past_end_of_btree_node,
-			 "bset past end of btree node")) {
-		i->u64s = 0;
-		ret = 0;
-		goto out;
-	}
-
-	btree_err_on(offset && !i->u64s,
-		     -BCH_ERR_btree_node_read_err_fixable,
-		     c, ca, b, i,
-		     bset_empty,
-		     "empty bset");
-
-	btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset,
-		     -BCH_ERR_btree_node_read_err_want_retry,
-		     c, ca, b, i,
-		     bset_wrong_sector_offset,
-		     "bset at wrong sector offset");
-
-	if (!offset) {
-		struct btree_node *bn =
-			container_of(i, struct btree_node, keys);
-		/* These indicate that we read the wrong btree node: */
-
-		if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
-			struct bch_btree_ptr_v2 *bp =
-				&bkey_i_to_btree_ptr_v2(&b->key)->v;
-
-			/* XXX endianness */
-			btree_err_on(bp->seq != bn->keys.seq,
-				     -BCH_ERR_btree_node_read_err_must_retry,
-				     c, ca, b, NULL,
-				     bset_bad_seq,
-				     "incorrect sequence number (wrong btree node)");
-		}
-
-		btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
-			     -BCH_ERR_btree_node_read_err_must_retry,
-			     c, ca, b, i,
-			     btree_node_bad_btree,
-			     "incorrect btree id");
-
-		btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
-			     -BCH_ERR_btree_node_read_err_must_retry,
-			     c, ca, b, i,
-			     btree_node_bad_level,
-			     "incorrect level");
-
-		if (!write)
-			compat_btree_node(b->c.level, b->c.btree_id, version,
-					  BSET_BIG_ENDIAN(i), write, bn);
-
-		if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
-			struct bch_btree_ptr_v2 *bp =
-				&bkey_i_to_btree_ptr_v2(&b->key)->v;
-
-			if (BTREE_PTR_RANGE_UPDATED(bp)) {
-				b->data->min_key = bp->min_key;
-				b->data->max_key = b->key.k.p;
-			}
-
-			btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
-				     -BCH_ERR_btree_node_read_err_must_retry,
-				     c, ca, b, NULL,
-				     btree_node_bad_min_key,
-				     "incorrect min_key: got %s should be %s",
-				     (printbuf_reset(&buf1),
-				      bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
-				     (printbuf_reset(&buf2),
-				      bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf));
-		}
-
-		btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
-			     -BCH_ERR_btree_node_read_err_must_retry,
-			     c, ca, b, i,
-			     btree_node_bad_max_key,
-			     "incorrect max key %s",
-			     (printbuf_reset(&buf1),
-			      bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));
-
-		if (write)
-			compat_btree_node(b->c.level, b->c.btree_id, version,
-					  BSET_BIG_ENDIAN(i), write, bn);
-
-		btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1),
-			     -BCH_ERR_btree_node_read_err_bad_node,
-			     c, ca, b, i,
-			     btree_node_bad_format,
-			     "invalid bkey format: %s\n  %s", buf1.buf,
-			     (printbuf_reset(&buf2),
-			      bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf));
-		printbuf_reset(&buf1);
-
-		compat_bformat(b->c.level, b->c.btree_id, version,
-			       BSET_BIG_ENDIAN(i), write,
-			       &bn->format);
-	}
-out:
-fsck_err:
-	printbuf_exit(&buf2);
-	printbuf_exit(&buf1);
-	return ret;
-}
-
-static int bset_key_invalid(struct bch_fs *c, struct btree *b,
-			    struct bkey_s_c k,
-			    bool updated_range, int rw,
-			    struct printbuf *err)
-{
-	return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?:
-		(!updated_range ? bch2_bkey_in_btree_node(c, b, k, err) : 0) ?:
-		(rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
-}
-
-static bool bkey_packed_valid(struct bch_fs *c, struct btree *b,
-			 struct bset *i, struct bkey_packed *k)
-{
-	if (bkey_p_next(k) > vstruct_last(i))
-		return false;
-
-	if (k->format > KEY_FORMAT_CURRENT)
-		return false;
-
-	if (!bkeyp_u64s_valid(&b->format, k))
-		return false;
-
-	struct printbuf buf = PRINTBUF;
-	struct bkey tmp;
-	struct bkey_s u = __bkey_disassemble(b, k, &tmp);
-	bool ret = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b), READ, &buf);
-	printbuf_exit(&buf);
-	return ret;
-}
-
-static int validate_bset_keys(struct bch_fs *c, struct btree *b,
-			 struct bset *i, int write,
-			 bool have_retry, bool *saw_error)
-{
-	unsigned version = le16_to_cpu(i->version);
-	struct bkey_packed *k, *prev = NULL;
-	struct printbuf buf = PRINTBUF;
-	bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
-		BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
-	int ret = 0;
-
-	for (k = i->start;
-	     k != vstruct_last(i);) {
-		struct bkey_s u;
-		struct bkey tmp;
-		unsigned next_good_key;
-
-		if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
-				 -BCH_ERR_btree_node_read_err_fixable,
-				 c, NULL, b, i,
-				 btree_node_bkey_past_bset_end,
-				 "key extends past end of bset")) {
-			i->u64s = cpu_to_le16((u64 *) k - i->_data);
-			break;
-		}
-
-		if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
-				 -BCH_ERR_btree_node_read_err_fixable,
-				 c, NULL, b, i,
-				 btree_node_bkey_bad_format,
-				 "invalid bkey format %u", k->format))
-			goto drop_this_key;
-
-		if (btree_err_on(!bkeyp_u64s_valid(&b->format, k),
-				 -BCH_ERR_btree_node_read_err_fixable,
-				 c, NULL, b, i,
-				 btree_node_bkey_bad_u64s,
-				 "bad k->u64s %u (min %u max %zu)", k->u64s,
-				 bkeyp_key_u64s(&b->format, k),
-				 U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k)))
-			goto drop_this_key;
-
-		if (!write)
-			bch2_bkey_compat(b->c.level, b->c.btree_id, version,
-				    BSET_BIG_ENDIAN(i), write,
-				    &b->format, k);
-
-		u = __bkey_disassemble(b, k, &tmp);
-
-		printbuf_reset(&buf);
-		if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) {
-			printbuf_reset(&buf);
-			bset_key_invalid(c, b, u.s_c, updated_range, write, &buf);
-			prt_printf(&buf, "\n  ");
-			bch2_bkey_val_to_text(&buf, c, u.s_c);
-
-			btree_err(-BCH_ERR_btree_node_read_err_fixable,
-				  c, NULL, b, i,
-				  btree_node_bad_bkey,
-				  "invalid bkey: %s", buf.buf);
-			goto drop_this_key;
-		}
-
-		if (write)
-			bch2_bkey_compat(b->c.level, b->c.btree_id, version,
-				    BSET_BIG_ENDIAN(i), write,
-				    &b->format, k);
-
-		if (prev && bkey_iter_cmp(b, prev, k) > 0) {
-			struct bkey up = bkey_unpack_key(b, prev);
-
-			printbuf_reset(&buf);
-			prt_printf(&buf, "keys out of order: ");
-			bch2_bkey_to_text(&buf, &up);
-			prt_printf(&buf, " > ");
-			bch2_bkey_to_text(&buf, u.k);
-
-			if (btree_err(-BCH_ERR_btree_node_read_err_fixable,
-				      c, NULL, b, i,
-				      btree_node_bkey_out_of_order,
-				      "%s", buf.buf))
-				goto drop_this_key;
-		}
-
-		prev = k;
-		k = bkey_p_next(k);
-		continue;
-drop_this_key:
-		next_good_key = k->u64s;
-
-		if (!next_good_key ||
-		    (BSET_BIG_ENDIAN(i) == CPU_BIG_ENDIAN &&
-		     version >= bcachefs_metadata_version_snapshot)) {
-			/*
-			 * only do scanning if bch2_bkey_compat() has nothing to
-			 * do
-			 */
-
-			if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
-				for (next_good_key = 1;
-				     next_good_key < (u64 *) vstruct_last(i) - (u64 *) k;
-				     next_good_key++)
-					if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
-						goto got_good_key;
-			}
-
-			/*
-			 * didn't find a good key, have to truncate the rest of
-			 * the bset
-			 */
-			next_good_key = (u64 *) vstruct_last(i) - (u64 *) k;
-		}
-got_good_key:
-		le16_add_cpu(&i->u64s, -next_good_key);
-		memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k);
-	}
-fsck_err:
-	printbuf_exit(&buf);
-	return ret;
-}
-
-int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
-			      struct btree *b, bool have_retry, bool *saw_error)
-{
-	struct btree_node_entry *bne;
-	struct sort_iter *iter;
-	struct btree_node *sorted;
-	struct bkey_packed *k;
-	struct bset *i;
-	bool used_mempool, blacklisted;
-	bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
-		BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
-	unsigned u64s;
-	unsigned ptr_written = btree_ptr_sectors_written(&b->key);
-	struct printbuf buf = PRINTBUF;
-	int ret = 0, retry_read = 0, write = READ;
-	u64 start_time = local_clock();
-
-	b->version_ondisk = U16_MAX;
-	/* We might get called multiple times on read retry: */
-	b->written = 0;
-
-	iter = mempool_alloc(&c->fill_iter, GFP_NOFS);
-	sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2);
-
-	if (bch2_meta_read_fault("btree"))
-		btree_err(-BCH_ERR_btree_node_read_err_must_retry,
-			  c, ca, b, NULL,
-			  btree_node_fault_injected,
-			  "dynamic fault");
-
-	btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
-		     -BCH_ERR_btree_node_read_err_must_retry,
-		     c, ca, b, NULL,
-		     btree_node_bad_magic,
-		     "bad magic: want %llx, got %llx",
-		     bset_magic(c), le64_to_cpu(b->data->magic));
-
-	if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
-		struct bch_btree_ptr_v2 *bp =
-			&bkey_i_to_btree_ptr_v2(&b->key)->v;
-
-		bch2_bpos_to_text(&buf, b->data->min_key);
-		prt_str(&buf, "-");
-		bch2_bpos_to_text(&buf, b->data->max_key);
-
-		btree_err_on(b->data->keys.seq != bp->seq,
-			     -BCH_ERR_btree_node_read_err_must_retry,
-			     c, ca, b, NULL,
-			     btree_node_bad_seq,
-			     "got wrong btree node: got\n%s",
-			     (printbuf_reset(&buf),
-			      bch2_btree_node_header_to_text(&buf, b->data),
-			      buf.buf));
-	} else {
-		btree_err_on(!b->data->keys.seq,
-			     -BCH_ERR_btree_node_read_err_must_retry,
-			     c, ca, b, NULL,
-			     btree_node_bad_seq,
-			     "bad btree header: seq 0\n%s",
-			     (printbuf_reset(&buf),
-			      bch2_btree_node_header_to_text(&buf, b->data),
-			      buf.buf));
-	}
-
-	while (b->written < (ptr_written ?: btree_sectors(c))) {
-		unsigned sectors;
-		struct nonce nonce;
-		bool first = !b->written;
-		bool csum_bad;
-
-		if (!b->written) {
-			i = &b->data->keys;
-
-			btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
-				     -BCH_ERR_btree_node_read_err_want_retry,
-				     c, ca, b, i,
-				     bset_unknown_csum,
-				     "unknown checksum type %llu", BSET_CSUM_TYPE(i));
-
-			nonce = btree_nonce(i, b->written << 9);
-
-			struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
-			csum_bad = bch2_crc_cmp(b->data->csum, csum);
-			if (csum_bad)
-				bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
-
-			btree_err_on(csum_bad,
-				     -BCH_ERR_btree_node_read_err_want_retry,
-				     c, ca, b, i,
-				     bset_bad_csum,
-				     "%s",
-				     (printbuf_reset(&buf),
-				      bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum),
-				      buf.buf));
-
-			ret = bset_encrypt(c, i, b->written << 9);
-			if (bch2_fs_fatal_err_on(ret, c,
-					"decrypting btree node: %s", bch2_err_str(ret)))
-				goto fsck_err;
-
-			btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
-				     !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
-				     -BCH_ERR_btree_node_read_err_incompatible,
-				     c, NULL, b, NULL,
-				     btree_node_unsupported_version,
-				     "btree node does not have NEW_EXTENT_OVERWRITE set");
-
-			sectors = vstruct_sectors(b->data, c->block_bits);
-		} else {
-			bne = write_block(b);
-			i = &bne->keys;
-
-			if (i->seq != b->data->keys.seq)
-				break;
-
-			btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
-				     -BCH_ERR_btree_node_read_err_want_retry,
-				     c, ca, b, i,
-				     bset_unknown_csum,
-				     "unknown checksum type %llu", BSET_CSUM_TYPE(i));
-
-			nonce = btree_nonce(i, b->written << 9);
-			struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
-			csum_bad = bch2_crc_cmp(bne->csum, csum);
-			if (ca && csum_bad)
-				bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
-
-			btree_err_on(csum_bad,
-				     -BCH_ERR_btree_node_read_err_want_retry,
-				     c, ca, b, i,
-				     bset_bad_csum,
-				     "%s",
-				     (printbuf_reset(&buf),
-				      bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum),
-				      buf.buf));
-
-			ret = bset_encrypt(c, i, b->written << 9);
-			if (bch2_fs_fatal_err_on(ret, c,
-					"decrypting btree node: %s", bch2_err_str(ret)))
-				goto fsck_err;
-
-			sectors = vstruct_sectors(bne, c->block_bits);
-		}
-
-		b->version_ondisk = min(b->version_ondisk,
-					le16_to_cpu(i->version));
-
-		ret = validate_bset(c, ca, b, i, b->written, sectors,
-				    READ, have_retry, saw_error);
-		if (ret)
-			goto fsck_err;
-
-		if (!b->written)
-			btree_node_set_format(b, b->data->format);
-
-		ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error);
-		if (ret)
-			goto fsck_err;
-
-		SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
-
-		blacklisted = bch2_journal_seq_is_blacklisted(c,
-					le64_to_cpu(i->journal_seq),
-					true);
-
-		btree_err_on(blacklisted && first,
-			     -BCH_ERR_btree_node_read_err_fixable,
-			     c, ca, b, i,
-			     bset_blacklisted_journal_seq,
-			     "first btree node bset has blacklisted journal seq (%llu)",
-			     le64_to_cpu(i->journal_seq));
-
-		btree_err_on(blacklisted && ptr_written,
-			     -BCH_ERR_btree_node_read_err_fixable,
-			     c, ca, b, i,
-			     first_bset_blacklisted_journal_seq,
-			     "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
-			     le64_to_cpu(i->journal_seq),
-			     b->written, b->written + sectors, ptr_written);
-
-		b->written += sectors;
-
-		if (blacklisted && !first)
-			continue;
-
-		sort_iter_add(iter,
-			      vstruct_idx(i, 0),
-			      vstruct_last(i));
-	}
-
-	if (ptr_written) {
-		btree_err_on(b->written < ptr_written,
-			     -BCH_ERR_btree_node_read_err_want_retry,
-			     c, ca, b, NULL,
-			     btree_node_data_missing,
-			     "btree node data missing: expected %u sectors, found %u",
-			     ptr_written, b->written);
-	} else {
-		for (bne = write_block(b);
-		     bset_byte_offset(b, bne) < btree_buf_bytes(b);
-		     bne = (void *) bne + block_bytes(c))
-			btree_err_on(bne->keys.seq == b->data->keys.seq &&
-				     !bch2_journal_seq_is_blacklisted(c,
-								      le64_to_cpu(bne->keys.journal_seq),
-								      true),
-				     -BCH_ERR_btree_node_read_err_want_retry,
-				     c, ca, b, NULL,
-				     btree_node_bset_after_end,
-				     "found bset signature after last bset");
-	}
-
-	sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
-	sorted->keys.u64s = 0;
-
-	set_btree_bset(b, b->set, &b->data->keys);
-
-	b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
-
-	u64s = le16_to_cpu(sorted->keys.u64s);
-	*sorted = *b->data;
-	sorted->keys.u64s = cpu_to_le16(u64s);
-	swap(sorted, b->data);
-	set_btree_bset(b, b->set, &b->data->keys);
-	b->nsets = 1;
-
-	BUG_ON(b->nr.live_u64s != u64s);
-
-	btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);
-
-	if (updated_range)
-		bch2_btree_node_drop_keys_outside_node(b);
-
-	i = &b->data->keys;
-	for (k = i->start; k != vstruct_last(i);) {
-		struct bkey tmp;
-		struct bkey_s u = __bkey_disassemble(b, k, &tmp);
-
-		printbuf_reset(&buf);
-
-		if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) ||
-		    (bch2_inject_invalid_keys &&
-		     !bversion_cmp(u.k->version, MAX_VERSION))) {
-			printbuf_reset(&buf);
-
-			prt_printf(&buf, "invalid bkey: ");
-			bch2_bkey_val_invalid(c, u.s_c, READ, &buf);
-			prt_printf(&buf, "\n  ");
-			bch2_bkey_val_to_text(&buf, c, u.s_c);
-
-			btree_err(-BCH_ERR_btree_node_read_err_fixable,
-				  c, NULL, b, i,
-				  btree_node_bad_bkey,
-				  "%s", buf.buf);
-
-			btree_keys_account_key_drop(&b->nr, 0, k);
-
-			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
-			memmove_u64s_down(k, bkey_p_next(k),
-					  (u64 *) vstruct_end(i) - (u64 *) k);
-			set_btree_bset_end(b, b->set);
-			continue;
-		}
-
-		if (u.k->type == KEY_TYPE_btree_ptr_v2) {
-			struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u);
-
-			bp.v->mem_ptr = 0;
-		}
-
-		k = bkey_p_next(k);
-	}
-
-	bch2_bset_build_aux_tree(b, b->set, false);
-
-	set_needs_whiteout(btree_bset_first(b), true);
-
-	btree_node_reset_sib_u64s(b);
-
-	rcu_read_lock();
-	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
-		struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
-
-		if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw)
-			set_btree_node_need_rewrite(b);
-	}
-	rcu_read_unlock();
-
-	if (!ptr_written)
-		set_btree_node_need_rewrite(b);
-out:
-	mempool_free(iter, &c->fill_iter);
-	printbuf_exit(&buf);
-	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
-	return retry_read;
-fsck_err:
-	if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
-	    ret == -BCH_ERR_btree_node_read_err_must_retry) {
-		retry_read = 1;
-	} else {
-		set_btree_node_read_error(b);
-		bch2_btree_lost_data(c, b->c.btree_id);
-	}
-	goto out;
-}
-
-static void btree_node_read_work(struct work_struct *work)
-{
-	struct btree_read_bio *rb =
-		container_of(work, struct btree_read_bio, work);
-	struct bch_fs *c	= rb->c;
-	struct bch_dev *ca	= rb->have_ioref ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
-	struct btree *b		= rb->b;
-	struct bio *bio		= &rb->bio;
-	struct bch_io_failures failed = { .nr = 0 };
-	struct printbuf buf = PRINTBUF;
-	bool saw_error = false;
-	bool retry = false;
-	bool can_retry;
-
-	goto start;
-	while (1) {
-		retry = true;
-		bch_info(c, "retrying read");
-		ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ);
-		rb->have_ioref		= ca != NULL;
-		bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
-		bio->bi_iter.bi_sector	= rb->pick.ptr.offset;
-		bio->bi_iter.bi_size	= btree_buf_bytes(b);
-
-		if (rb->have_ioref) {
-			bio_set_dev(bio, ca->disk_sb.bdev);
-			submit_bio_wait(bio);
-		} else {
-			bio->bi_status = BLK_STS_REMOVED;
-		}
-start:
-		printbuf_reset(&buf);
-		bch2_btree_pos_to_text(&buf, c, b);
-		bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read,
-				   "btree read error %s for %s",
-				   bch2_blk_status_to_str(bio->bi_status), buf.buf);
-		if (rb->have_ioref)
-			percpu_ref_put(&ca->io_ref);
-		rb->have_ioref = false;
-
-		bch2_mark_io_failure(&failed, &rb->pick);
-
-		can_retry = bch2_bkey_pick_read_device(c,
-				bkey_i_to_s_c(&b->key),
-				&failed, &rb->pick) > 0;
-
-		if (!bio->bi_status &&
-		    !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) {
-			if (retry)
-				bch_info(c, "retry success");
-			break;
-		}
-
-		saw_error = true;
-
-		if (!can_retry) {
-			set_btree_node_read_error(b);
-			bch2_btree_lost_data(c, b->c.btree_id);
-			break;
-		}
-	}
-
-	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
-			       rb->start_time);
-	bio_put(&rb->bio);
-
-	if (saw_error &&
-	    !btree_node_read_error(b) &&
-	    c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
-		printbuf_reset(&buf);
-		bch2_bpos_to_text(&buf, b->key.k.p);
-		bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
-			 __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf);
-
-		bch2_btree_node_rewrite_async(c, b);
-	}
-
-	printbuf_exit(&buf);
-	clear_btree_node_read_in_flight(b);
-	wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
-}
-
-static void btree_node_read_endio(struct bio *bio)
-{
-	struct btree_read_bio *rb =
-		container_of(bio, struct btree_read_bio, bio);
-	struct bch_fs *c	= rb->c;
-
-	if (rb->have_ioref) {
-		struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
-
-		bch2_latency_acct(ca, rb->start_time, READ);
-	}
-
-	queue_work(c->io_complete_wq, &rb->work);
-}
-
-struct btree_node_read_all {
-	struct closure		cl;
-	struct bch_fs		*c;
-	struct btree		*b;
-	unsigned		nr;
-	void			*buf[BCH_REPLICAS_MAX];
-	struct bio		*bio[BCH_REPLICAS_MAX];
-	blk_status_t		err[BCH_REPLICAS_MAX];
-};
-
-static unsigned btree_node_sectors_written(struct bch_fs *c, void *data)
-{
-	struct btree_node *bn = data;
-	struct btree_node_entry *bne;
-	unsigned offset = 0;
-
-	if (le64_to_cpu(bn->magic) !=  bset_magic(c))
-		return 0;
-
-	while (offset < btree_sectors(c)) {
-		if (!offset) {
-			offset += vstruct_sectors(bn, c->block_bits);
-		} else {
-			bne = data + (offset << 9);
-			if (bne->keys.seq != bn->keys.seq)
-				break;
-			offset += vstruct_sectors(bne, c->block_bits);
-		}
-	}
-
-	return offset;
-}
-
-static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data)
-{
-	struct btree_node *bn = data;
-	struct btree_node_entry *bne;
-
-	if (!offset)
-		return false;
-
-	while (offset < btree_sectors(c)) {
-		bne = data + (offset << 9);
-		if (bne->keys.seq == bn->keys.seq)
-			return true;
-		offset++;
-	}
-
-	return false;
-	return offset;
-}
-
-static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
-{
-	closure_type(ra, struct btree_node_read_all, cl);
-	struct bch_fs *c = ra->c;
-	struct btree *b = ra->b;
-	struct printbuf buf = PRINTBUF;
-	bool dump_bset_maps = false;
-	bool have_retry = false;
-	int ret = 0, best = -1, write = READ;
-	unsigned i, written = 0, written2 = 0;
-	__le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
-		? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
-	bool _saw_error = false, *saw_error = &_saw_error;
-
-	for (i = 0; i < ra->nr; i++) {
-		struct btree_node *bn = ra->buf[i];
-
-		if (ra->err[i])
-			continue;
-
-		if (le64_to_cpu(bn->magic) != bset_magic(c) ||
-		    (seq && seq != bn->keys.seq))
-			continue;
-
-		if (best < 0) {
-			best = i;
-			written = btree_node_sectors_written(c, bn);
-			continue;
-		}
-
-		written2 = btree_node_sectors_written(c, ra->buf[i]);
-		if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable,
-				 c, NULL, b, NULL,
-				 btree_node_replicas_sectors_written_mismatch,
-				 "btree node sectors written mismatch: %u != %u",
-				 written, written2) ||
-		    btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
-				 -BCH_ERR_btree_node_read_err_fixable,
-				 c, NULL, b, NULL,
-				 btree_node_bset_after_end,
-				 "found bset signature after last bset") ||
-		    btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9),
-				 -BCH_ERR_btree_node_read_err_fixable,
-				 c, NULL, b, NULL,
-				 btree_node_replicas_data_mismatch,
-				 "btree node replicas content mismatch"))
-			dump_bset_maps = true;
-
-		if (written2 > written) {
-			written = written2;
-			best = i;
-		}
-	}
-fsck_err:
-	if (dump_bset_maps) {
-		for (i = 0; i < ra->nr; i++) {
-			struct btree_node *bn = ra->buf[i];
-			struct btree_node_entry *bne = NULL;
-			unsigned offset = 0, sectors;
-			bool gap = false;
-
-			if (ra->err[i])
-				continue;
-
-			printbuf_reset(&buf);
-
-			while (offset < btree_sectors(c)) {
-				if (!offset) {
-					sectors = vstruct_sectors(bn, c->block_bits);
-				} else {
-					bne = ra->buf[i] + (offset << 9);
-					if (bne->keys.seq != bn->keys.seq)
-						break;
-					sectors = vstruct_sectors(bne, c->block_bits);
-				}
-
-				prt_printf(&buf, " %u-%u", offset, offset + sectors);
-				if (bne && bch2_journal_seq_is_blacklisted(c,
-							le64_to_cpu(bne->keys.journal_seq), false))
-					prt_printf(&buf, "*");
-				offset += sectors;
-			}
-
-			while (offset < btree_sectors(c)) {
-				bne = ra->buf[i] + (offset << 9);
-				if (bne->keys.seq == bn->keys.seq) {
-					if (!gap)
-						prt_printf(&buf, " GAP");
-					gap = true;
-
-					sectors = vstruct_sectors(bne, c->block_bits);
-					prt_printf(&buf, " %u-%u", offset, offset + sectors);
-					if (bch2_journal_seq_is_blacklisted(c,
-							le64_to_cpu(bne->keys.journal_seq), false))
-						prt_printf(&buf, "*");
-				}
-				offset++;
-			}
-
-			bch_err(c, "replica %u:%s", i, buf.buf);
-		}
-	}
-
-	if (best >= 0) {
-		memcpy(b->data, ra->buf[best], btree_buf_bytes(b));
-		ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
-	} else {
-		ret = -1;
-	}
-
-	if (ret) {
-		set_btree_node_read_error(b);
-		bch2_btree_lost_data(c, b->c.btree_id);
-	} else if (*saw_error)
-		bch2_btree_node_rewrite_async(c, b);
-
-	for (i = 0; i < ra->nr; i++) {
-		mempool_free(ra->buf[i], &c->btree_bounce_pool);
-		bio_put(ra->bio[i]);
-	}
-
-	closure_debug_destroy(&ra->cl);
-	kfree(ra);
-	printbuf_exit(&buf);
-
-	clear_btree_node_read_in_flight(b);
-	wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
-}
-
-static void btree_node_read_all_replicas_endio(struct bio *bio)
-{
-	struct btree_read_bio *rb =
-		container_of(bio, struct btree_read_bio, bio);
-	struct bch_fs *c	= rb->c;
-	struct btree_node_read_all *ra = rb->ra;
-
-	if (rb->have_ioref) {
-		struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
-
-		bch2_latency_acct(ca, rb->start_time, READ);
-	}
-
-	ra->err[rb->idx] = bio->bi_status;
-	closure_put(&ra->cl);
-}
-
-/*
- * XXX This allocates multiple times from the same mempools, and can deadlock
- * under sufficient memory pressure (but is only a debug path)
- */
-static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync)
-{
-	struct bkey_s_c k = bkey_i_to_s_c(&b->key);
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	const union bch_extent_entry *entry;
-	struct extent_ptr_decoded pick;
-	struct btree_node_read_all *ra;
-	unsigned i;
-
-	ra = kzalloc(sizeof(*ra), GFP_NOFS);
-	if (!ra)
-		return -BCH_ERR_ENOMEM_btree_node_read_all_replicas;
-
-	closure_init(&ra->cl, NULL);
-	ra->c	= c;
-	ra->b	= b;
-	ra->nr	= bch2_bkey_nr_ptrs(k);
-
-	for (i = 0; i < ra->nr; i++) {
-		ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
-		ra->bio[i] = bio_alloc_bioset(NULL,
-					      buf_pages(ra->buf[i], btree_buf_bytes(b)),
-					      REQ_OP_READ|REQ_SYNC|REQ_META,
-					      GFP_NOFS,
-					      &c->btree_bio);
-	}
-
-	i = 0;
-	bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
-		struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
-		struct btree_read_bio *rb =
-			container_of(ra->bio[i], struct btree_read_bio, bio);
-		rb->c			= c;
-		rb->b			= b;
-		rb->ra			= ra;
-		rb->start_time		= local_clock();
-		rb->have_ioref		= ca != NULL;
-		rb->idx			= i;
-		rb->pick		= pick;
-		rb->bio.bi_iter.bi_sector = pick.ptr.offset;
-		rb->bio.bi_end_io	= btree_node_read_all_replicas_endio;
-		bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b));
-
-		if (rb->have_ioref) {
-			this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
-				     bio_sectors(&rb->bio));
-			bio_set_dev(&rb->bio, ca->disk_sb.bdev);
-
-			closure_get(&ra->cl);
-			submit_bio(&rb->bio);
-		} else {
-			ra->err[i] = BLK_STS_REMOVED;
-		}
-
-		i++;
-	}
-
-	if (sync) {
-		closure_sync(&ra->cl);
-		btree_node_read_all_replicas_done(&ra->cl.work);
-	} else {
-		continue_at(&ra->cl, btree_node_read_all_replicas_done,
-			    c->io_complete_wq);
-	}
-
-	return 0;
-}
-
-void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
-			  bool sync)
-{
-	struct bch_fs *c = trans->c;
-	struct extent_ptr_decoded pick;
-	struct btree_read_bio *rb;
-	struct bch_dev *ca;
-	struct bio *bio;
-	int ret;
-
-	trace_and_count(c, btree_node_read, trans, b);
-
-	if (bch2_verify_all_btree_replicas &&
-	    !btree_node_read_all_replicas(c, b, sync))
-		return;
-
-	ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
-					 NULL, &pick);
-
-	if (ret <= 0) {
-		struct printbuf buf = PRINTBUF;
-
-		prt_str(&buf, "btree node read error: no device to read from\n at ");
-		bch2_btree_pos_to_text(&buf, c, b);
-		bch_err_ratelimited(c, "%s", buf.buf);
-
-		if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
-		    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
-			bch2_fatal_error(c);
-
-		set_btree_node_read_error(b);
-		bch2_btree_lost_data(c, b->c.btree_id);
-		clear_btree_node_read_in_flight(b);
-		wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
-		printbuf_exit(&buf);
-		return;
-	}
-
-	ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
-
-	bio = bio_alloc_bioset(NULL,
-			       buf_pages(b->data, btree_buf_bytes(b)),
-			       REQ_OP_READ|REQ_SYNC|REQ_META,
-			       GFP_NOFS,
-			       &c->btree_bio);
-	rb = container_of(bio, struct btree_read_bio, bio);
-	rb->c			= c;
-	rb->b			= b;
-	rb->ra			= NULL;
-	rb->start_time		= local_clock();
-	rb->have_ioref		= ca != NULL;
-	rb->pick		= pick;
-	INIT_WORK(&rb->work, btree_node_read_work);
-	bio->bi_iter.bi_sector	= pick.ptr.offset;
-	bio->bi_end_io		= btree_node_read_endio;
-	bch2_bio_map(bio, b->data, btree_buf_bytes(b));
-
-	if (rb->have_ioref) {
-		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
-			     bio_sectors(bio));
-		bio_set_dev(bio, ca->disk_sb.bdev);
-
-		if (sync) {
-			submit_bio_wait(bio);
-			bch2_latency_acct(ca, rb->start_time, READ);
-			btree_node_read_work(&rb->work);
-		} else {
-			submit_bio(bio);
-		}
-	} else {
-		bio->bi_status = BLK_STS_REMOVED;
-
-		if (sync)
-			btree_node_read_work(&rb->work);
-		else
-			queue_work(c->io_complete_wq, &rb->work);
-	}
-}
-
-static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
-				  const struct bkey_i *k, unsigned level)
-{
-	struct bch_fs *c = trans->c;
-	struct closure cl;
-	struct btree *b;
-	int ret;
-
-	closure_init_stack(&cl);
-
-	do {
-		ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
-		closure_sync(&cl);
-	} while (ret);
-
-	b = bch2_btree_node_mem_alloc(trans, level != 0);
-	bch2_btree_cache_cannibalize_unlock(trans);
-
-	BUG_ON(IS_ERR(b));
-
-	bkey_copy(&b->key, k);
-	BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
-
-	set_btree_node_read_in_flight(b);
-
-	bch2_btree_node_read(trans, b, true);
-
-	if (btree_node_read_error(b)) {
-		bch2_btree_node_hash_remove(&c->btree_cache, b);
-
-		mutex_lock(&c->btree_cache.lock);
-		list_move(&b->list, &c->btree_cache.freeable);
-		mutex_unlock(&c->btree_cache.lock);
-
-		ret = -BCH_ERR_btree_node_read_error;
-		goto err;
-	}
-
-	bch2_btree_set_root_for_read(c, b);
-err:
-	six_unlock_write(&b->c.lock);
-	six_unlock_intent(&b->c.lock);
-
-	return ret;
-}
-
-int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
-			const struct bkey_i *k, unsigned level)
-{
-	return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
-}
-
-static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
-				      struct btree_write *w)
-{
-	unsigned long old, new, v = READ_ONCE(b->will_make_reachable);
-
-	do {
-		old = new = v;
-		if (!(old & 1))
-			break;
-
-		new &= ~1UL;
-	} while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old);
-
-	if (old & 1)
-		closure_put(&((struct btree_update *) new)->cl);
-
-	bch2_journal_pin_drop(&c->journal, &w->journal);
-}
-
-static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
-{
-	struct btree_write *w = btree_prev_write(b);
-	unsigned long old, new, v;
-	unsigned type = 0;
-
-	bch2_btree_complete_write(c, b, w);
-
-	v = READ_ONCE(b->flags);
-	do {
-		old = new = v;
-
-		if ((old & (1U << BTREE_NODE_dirty)) &&
-		    (old & (1U << BTREE_NODE_need_write)) &&
-		    !(old & (1U << BTREE_NODE_never_write)) &&
-		    !(old & (1U << BTREE_NODE_write_blocked)) &&
-		    !(old & (1U << BTREE_NODE_will_make_reachable))) {
-			new &= ~(1U << BTREE_NODE_dirty);
-			new &= ~(1U << BTREE_NODE_need_write);
-			new |=  (1U << BTREE_NODE_write_in_flight);
-			new |=  (1U << BTREE_NODE_write_in_flight_inner);
-			new |=  (1U << BTREE_NODE_just_written);
-			new ^=  (1U << BTREE_NODE_write_idx);
-
-			type = new & BTREE_WRITE_TYPE_MASK;
-			new &= ~BTREE_WRITE_TYPE_MASK;
-		} else {
-			new &= ~(1U << BTREE_NODE_write_in_flight);
-			new &= ~(1U << BTREE_NODE_write_in_flight_inner);
-		}
-	} while ((v = cmpxchg(&b->flags, old, new)) != old);
-
-	if (new & (1U << BTREE_NODE_write_in_flight))
-		__bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type);
-	else
-		wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
-}
-
-static void btree_node_write_done(struct bch_fs *c, struct btree *b)
-{
-	struct btree_trans *trans = bch2_trans_get(c);
-
-	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
-	__btree_node_write_done(c, b);
-	six_unlock_read(&b->c.lock);
-
-	bch2_trans_put(trans);
-}
-
-static void btree_node_write_work(struct work_struct *work)
-{
-	struct btree_write_bio *wbio =
-		container_of(work, struct btree_write_bio, work);
-	struct bch_fs *c	= wbio->wbio.c;
-	struct btree *b		= wbio->wbio.bio.bi_private;
-	int ret = 0;
-
-	btree_bounce_free(c,
-		wbio->data_bytes,
-		wbio->wbio.used_mempool,
-		wbio->data);
-
-	bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
-		bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
-
-	if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
-		ret = -BCH_ERR_btree_node_write_all_failed;
-		goto err;
-	}
-
-	if (wbio->wbio.first_btree_write) {
-		if (wbio->wbio.failed.nr) {
-
-		}
-	} else {
-		ret = bch2_trans_do(c, NULL, NULL, 0,
-			bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
-					BCH_WATERMARK_interior_updates|
-					BCH_TRANS_COMMIT_journal_reclaim|
-					BCH_TRANS_COMMIT_no_enospc|
-					BCH_TRANS_COMMIT_no_check_rw,
-					!wbio->wbio.failed.nr));
-		if (ret)
-			goto err;
-	}
-out:
-	bio_put(&wbio->wbio.bio);
-	btree_node_write_done(c, b);
-	return;
-err:
-	set_btree_node_noevict(b);
-	bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
-			     "writing btree node: %s", bch2_err_str(ret));
-	goto out;
-}
-
-static void btree_node_write_endio(struct bio *bio)
-{
-	struct bch_write_bio *wbio	= to_wbio(bio);
-	struct bch_write_bio *parent	= wbio->split ? wbio->parent : NULL;
-	struct bch_write_bio *orig	= parent ?: wbio;
-	struct btree_write_bio *wb	= container_of(orig, struct btree_write_bio, wbio);
-	struct bch_fs *c		= wbio->c;
-	struct btree *b			= wbio->bio.bi_private;
-	struct bch_dev *ca		= wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL;
-	unsigned long flags;
-
-	if (wbio->have_ioref)
-		bch2_latency_acct(ca, wbio->submit_time, WRITE);
-
-	if (!ca ||
-	    bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
-			       "btree write error: %s",
-			       bch2_blk_status_to_str(bio->bi_status)) ||
-	    bch2_meta_write_fault("btree")) {
-		spin_lock_irqsave(&c->btree_write_error_lock, flags);
-		bch2_dev_list_add_dev(&orig->failed, wbio->dev);
-		spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
-	}
-
-	if (wbio->have_ioref)
-		percpu_ref_put(&ca->io_ref);
-
-	if (parent) {
-		bio_put(bio);
-		bio_endio(&parent->bio);
-		return;
-	}
-
-	clear_btree_node_write_in_flight_inner(b);
-	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner);
-	INIT_WORK(&wb->work, btree_node_write_work);
-	queue_work(c->btree_io_complete_wq, &wb->work);
-}
-
-static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
-				   struct bset *i, unsigned sectors)
-{
-	struct printbuf buf = PRINTBUF;
-	bool saw_error;
-	int ret;
-
-	ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key),
-				BKEY_TYPE_btree, WRITE, &buf);
-
-	if (ret)
-		bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf);
-	printbuf_exit(&buf);
-	if (ret)
-		return ret;
-
-	ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?:
-		validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error);
-	if (ret) {
-		bch2_inconsistent_error(c);
-		dump_stack();
-	}
-
-	return ret;
-}
-
-static void btree_write_submit(struct work_struct *work)
-{
-	struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
-	BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
-
-	bkey_copy(&tmp.k, &wbio->key);
-
-	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr)
-		ptr->offset += wbio->sector_offset;
-
-	bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree,
-				  &tmp.k, false);
-}
-
-void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
-{
-	struct btree_write_bio *wbio;
-	struct bset *i;
-	struct btree_node *bn = NULL;
-	struct btree_node_entry *bne = NULL;
-	struct sort_iter_stack sort_iter;
-	struct nonce nonce;
-	unsigned bytes_to_write, sectors_to_write, bytes, u64s;
-	u64 seq = 0;
-	bool used_mempool;
-	unsigned long old, new;
-	bool validate_before_checksum = false;
-	enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
-	void *data;
-	int ret;
-
-	if (flags & BTREE_WRITE_ALREADY_STARTED)
-		goto do_write;
-
-	/*
-	 * We may only have a read lock on the btree node - the dirty bit is our
-	 * "lock" against racing with other threads that may be trying to start
-	 * a write, we do a write iff we clear the dirty bit. Since setting the
-	 * dirty bit requires a write lock, we can't race with other threads
-	 * redirtying it:
-	 */
-	do {
-		old = new = READ_ONCE(b->flags);
-
-		if (!(old & (1 << BTREE_NODE_dirty)))
-			return;
-
-		if ((flags & BTREE_WRITE_ONLY_IF_NEED) &&
-		    !(old & (1 << BTREE_NODE_need_write)))
-			return;
-
-		if (old &
-		    ((1 << BTREE_NODE_never_write)|
-		     (1 << BTREE_NODE_write_blocked)))
-			return;
-
-		if (b->written &&
-		    (old & (1 << BTREE_NODE_will_make_reachable)))
-			return;
-
-		if (old & (1 << BTREE_NODE_write_in_flight))
-			return;
-
-		if (flags & BTREE_WRITE_ONLY_IF_NEED)
-			type = new & BTREE_WRITE_TYPE_MASK;
-		new &= ~BTREE_WRITE_TYPE_MASK;
-
-		new &= ~(1 << BTREE_NODE_dirty);
-		new &= ~(1 << BTREE_NODE_need_write);
-		new |=  (1 << BTREE_NODE_write_in_flight);
-		new |=  (1 << BTREE_NODE_write_in_flight_inner);
-		new |=  (1 << BTREE_NODE_just_written);
-		new ^=  (1 << BTREE_NODE_write_idx);
-	} while (cmpxchg_acquire(&b->flags, old, new) != old);
-
-	if (new & (1U << BTREE_NODE_need_write))
-		return;
-do_write:
-	BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));
-
-	atomic_dec(&c->btree_cache.dirty);
-
-	BUG_ON(btree_node_fake(b));
-	BUG_ON((b->will_make_reachable != 0) != !b->written);
-
-	BUG_ON(b->written >= btree_sectors(c));
-	BUG_ON(b->written & (block_sectors(c) - 1));
-	BUG_ON(bset_written(b, btree_bset_last(b)));
-	BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
-	BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
-
-	bch2_sort_whiteouts(c, b);
-
-	sort_iter_stack_init(&sort_iter, b);
-
-	bytes = !b->written
-		? sizeof(struct btree_node)
-		: sizeof(struct btree_node_entry);
-
-	bytes += b->whiteout_u64s * sizeof(u64);
-
-	for_each_bset(b, t) {
-		i = bset(b, t);
-
-		if (bset_written(b, i))
-			continue;
-
-		bytes += le16_to_cpu(i->u64s) * sizeof(u64);
-		sort_iter_add(&sort_iter.iter,
-			      btree_bkey_first(b, t),
-			      btree_bkey_last(b, t));
-		seq = max(seq, le64_to_cpu(i->journal_seq));
-	}
-
-	BUG_ON(b->written && !seq);
-
-	/* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */
-	bytes += 8;
-
-	/* buffer must be a multiple of the block size */
-	bytes = round_up(bytes, block_bytes(c));
-
-	data = btree_bounce_alloc(c, bytes, &used_mempool);
-
-	if (!b->written) {
-		bn = data;
-		*bn = *b->data;
-		i = &bn->keys;
-	} else {
-		bne = data;
-		bne->keys = b->data->keys;
-		i = &bne->keys;
-	}
-
-	i->journal_seq	= cpu_to_le64(seq);
-	i->u64s		= 0;
-
-	sort_iter_add(&sort_iter.iter,
-		      unwritten_whiteouts_start(b),
-		      unwritten_whiteouts_end(b));
-	SET_BSET_SEPARATE_WHITEOUTS(i, false);
-
-	u64s = bch2_sort_keys_keep_unwritten_whiteouts(i->start, &sort_iter.iter);
-	le16_add_cpu(&i->u64s, u64s);
-
-	b->whiteout_u64s = 0;
-
-	BUG_ON(!b->written && i->u64s != b->data->keys.u64s);
-
-	set_needs_whiteout(i, false);
-
-	/* do we have data to write? */
-	if (b->written && !i->u64s)
-		goto nowrite;
-
-	bytes_to_write = vstruct_end(i) - data;
-	sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
-
-	if (!b->written &&
-	    b->key.k.type == KEY_TYPE_btree_ptr_v2)
-		BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write);
-
-	memset(data + bytes_to_write, 0,
-	       (sectors_to_write << 9) - bytes_to_write);
-
-	BUG_ON(b->written + sectors_to_write > btree_sectors(c));
-	BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
-	BUG_ON(i->seq != b->data->keys.seq);
-
-	i->version = cpu_to_le16(c->sb.version);
-	SET_BSET_OFFSET(i, b->written);
-	SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));
-
-	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)))
-		validate_before_checksum = true;
-
-	/* validate_bset will be modifying: */
-	if (le16_to_cpu(i->version) < bcachefs_metadata_version_current)
-		validate_before_checksum = true;
-
-	/* if we're going to be encrypting, check metadata validity first: */
-	if (validate_before_checksum &&
-	    validate_bset_for_write(c, b, i, sectors_to_write))
-		goto err;
-
-	ret = bset_encrypt(c, i, b->written << 9);
-	if (bch2_fs_fatal_err_on(ret, c,
-			"encrypting btree node: %s", bch2_err_str(ret)))
-		goto err;
-
-	nonce = btree_nonce(i, b->written << 9);
-
-	if (bn)
-		bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
-	else
-		bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
-
-	/* if we're not encrypting, check metadata after checksumming: */
-	if (!validate_before_checksum &&
-	    validate_bset_for_write(c, b, i, sectors_to_write))
-		goto err;
-
-	/*
-	 * We handle btree write errors by immediately halting the journal -
-	 * after we've done that, we can't issue any subsequent btree writes
-	 * because they might have pointers to new nodes that failed to write.
-	 *
-	 * Furthermore, there's no point in doing any more btree writes because
-	 * with the journal stopped, we're never going to update the journal to
-	 * reflect that those writes were done and the data flushed from the
-	 * journal:
-	 *
-	 * Also on journal error, the pending write may have updates that were
-	 * never journalled (interior nodes, see btree_update_nodes_written()) -
-	 * it's critical that we don't do the write in that case otherwise we
-	 * will have updates visible that weren't in the journal:
-	 *
-	 * Make sure to update b->written so bch2_btree_init_next() doesn't
-	 * break:
-	 */
-	if (bch2_journal_error(&c->journal) ||
-	    c->opts.nochanges)
-		goto err;
-
-	trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write);
-
-	wbio = container_of(bio_alloc_bioset(NULL,
-				buf_pages(data, sectors_to_write << 9),
-				REQ_OP_WRITE|REQ_META,
-				GFP_NOFS,
-				&c->btree_bio),
-			    struct btree_write_bio, wbio.bio);
-	wbio_init(&wbio->wbio.bio);
-	wbio->data			= data;
-	wbio->data_bytes		= bytes;
-	wbio->sector_offset		= b->written;
-	wbio->wbio.c			= c;
-	wbio->wbio.used_mempool		= used_mempool;
-	wbio->wbio.first_btree_write	= !b->written;
-	wbio->wbio.bio.bi_end_io	= btree_node_write_endio;
-	wbio->wbio.bio.bi_private	= b;
-
-	bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
-
-	bkey_copy(&wbio->key, &b->key);
-
-	b->written += sectors_to_write;
-
-	if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2)
-		bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
-			cpu_to_le16(b->written);
-
-	atomic64_inc(&c->btree_write_stats[type].nr);
-	atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);
-
-	INIT_WORK(&wbio->work, btree_write_submit);
-	queue_work(c->io_complete_wq, &wbio->work);
-	return;
-err:
-	set_btree_node_noevict(b);
-	b->written += sectors_to_write;
-nowrite:
-	btree_bounce_free(c, bytes, used_mempool, data);
-	__btree_node_write_done(c, b);
-}
-
-/*
- * Work that must be done with write lock held:
- */
-bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
-{
-	bool invalidated_iter = false;
-	struct btree_node_entry *bne;
-
-	if (!btree_node_just_written(b))
-		return false;
-
-	BUG_ON(b->whiteout_u64s);
-
-	clear_btree_node_just_written(b);
-
-	/*
-	 * Note: immediately after write, bset_written() doesn't work - the
-	 * amount of data we had to write after compaction might have been
-	 * smaller than the offset of the last bset.
-	 *
-	 * However, we know that all bsets have been written here, as long as
-	 * we're still holding the write lock:
-	 */
-
-	/*
-	 * XXX: decide if we really want to unconditionally sort down to a
-	 * single bset:
-	 */
-	if (b->nsets > 1) {
-		btree_node_sort(c, b, 0, b->nsets);
-		invalidated_iter = true;
-	} else {
-		invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL);
-	}
-
-	for_each_bset(b, t)
-		set_needs_whiteout(bset(b, t), true);
-
-	bch2_btree_verify(c, b);
-
-	/*
-	 * If later we don't unconditionally sort down to a single bset, we have
-	 * to ensure this is still true:
-	 */
-	BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));
-
-	bne = want_new_bset(c, b);
-	if (bne)
-		bch2_bset_init_next(b, bne);
-
-	bch2_btree_build_aux_trees(b);
-
-	return invalidated_iter;
-}
-
-/*
- * Use this one if the node is intent locked:
- */
-void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
-			   enum six_lock_type lock_type_held,
-			   unsigned flags)
-{
-	if (lock_type_held == SIX_LOCK_intent ||
-	    (lock_type_held == SIX_LOCK_read &&
-	     six_lock_tryupgrade(&b->c.lock))) {
-		__bch2_btree_node_write(c, b, flags);
-
-		/* don't cycle lock unnecessarily: */
-		if (btree_node_just_written(b) &&
-		    six_trylock_write(&b->c.lock)) {
-			bch2_btree_post_write_cleanup(c, b);
-			six_unlock_write(&b->c.lock);
-		}
-
-		if (lock_type_held == SIX_LOCK_read)
-			six_lock_downgrade(&b->c.lock);
-	} else {
-		__bch2_btree_node_write(c, b, flags);
-		if (lock_type_held == SIX_LOCK_write &&
-		    btree_node_just_written(b))
-			bch2_btree_post_write_cleanup(c, b);
-	}
-}
-
-static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
-{
-	struct bucket_table *tbl;
-	struct rhash_head *pos;
-	struct btree *b;
-	unsigned i;
-	bool ret = false;
-restart:
-	rcu_read_lock();
-	for_each_cached_btree(b, c, tbl, i, pos)
-		if (test_bit(flag, &b->flags)) {
-			rcu_read_unlock();
-			wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
-			ret = true;
-			goto restart;
-		}
-	rcu_read_unlock();
-
-	return ret;
-}
-
-bool bch2_btree_flush_all_reads(struct bch_fs *c)
-{
-	return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
-}
-
-bool bch2_btree_flush_all_writes(struct bch_fs *c)
-{
-	return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
-}
-
-static const char * const bch2_btree_write_types[] = {
-#define x(t, n) [n] = #t,
-	BCH_BTREE_WRITE_TYPES()
-	NULL
-};
-
-void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c)
-{
-	printbuf_tabstop_push(out, 20);
-	printbuf_tabstop_push(out, 10);
-
-	prt_printf(out, "\tnr\tsize\n");
-
-	for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) {
-		u64 nr		= atomic64_read(&c->btree_write_stats[i].nr);
-		u64 bytes	= atomic64_read(&c->btree_write_stats[i].bytes);
-
-		prt_printf(out, "%s:\t%llu\t", bch2_btree_write_types[i], nr);
-		prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0);
-		prt_newline(out);
-	}
-}
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
deleted file mode 100644
index 2b8b564fc560..000000000000
--- a/fs/bcachefs/btree_io.h
+++ /dev/null
@@ -1,223 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_IO_H
-#define _BCACHEFS_BTREE_IO_H
-
-#include "bkey_methods.h"
-#include "bset.h"
-#include "btree_locking.h"
-#include "checksum.h"
-#include "extents.h"
-#include "io_write_types.h"
-
-struct bch_fs;
-struct btree_write;
-struct btree;
-struct btree_iter;
-struct btree_node_read_all;
-
-static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
-{
-	if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
-		atomic_inc(&c->btree_cache.dirty);
-}
-
-static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
-{
-	if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
-		atomic_dec(&c->btree_cache.dirty);
-}
-
-static inline unsigned btree_ptr_sectors_written(struct bkey_i *k)
-{
-	return k->k.type == KEY_TYPE_btree_ptr_v2
-		? le16_to_cpu(bkey_i_to_btree_ptr_v2(k)->v.sectors_written)
-		: 0;
-}
-
-struct btree_read_bio {
-	struct bch_fs		*c;
-	struct btree		*b;
-	struct btree_node_read_all *ra;
-	u64			start_time;
-	unsigned		have_ioref:1;
-	unsigned		idx:7;
-	struct extent_ptr_decoded	pick;
-	struct work_struct	work;
-	struct bio		bio;
-};
-
-struct btree_write_bio {
-	struct work_struct	work;
-	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
-	void			*data;
-	unsigned		data_bytes;
-	unsigned		sector_offset;
-	struct bch_write_bio	wbio;
-};
-
-void bch2_btree_node_io_unlock(struct btree *);
-void bch2_btree_node_io_lock(struct btree *);
-void __bch2_btree_node_wait_on_read(struct btree *);
-void __bch2_btree_node_wait_on_write(struct btree *);
-void bch2_btree_node_wait_on_read(struct btree *);
-void bch2_btree_node_wait_on_write(struct btree *);
-
-enum compact_mode {
-	COMPACT_LAZY,
-	COMPACT_ALL,
-};
-
-bool bch2_compact_whiteouts(struct bch_fs *, struct btree *,
-			    enum compact_mode);
-
-static inline bool should_compact_bset_lazy(struct btree *b,
-					    struct bset_tree *t)
-{
-	unsigned total_u64s = bset_u64s(t);
-	unsigned dead_u64s = bset_dead_u64s(b, t);
-
-	return dead_u64s > 64 && dead_u64s * 3 > total_u64s;
-}
-
-static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b)
-{
-	for_each_bset(b, t)
-		if (should_compact_bset_lazy(b, t))
-			return bch2_compact_whiteouts(c, b, COMPACT_LAZY);
-
-	return false;
-}
-
-static inline struct nonce btree_nonce(struct bset *i, unsigned offset)
-{
-	return (struct nonce) {{
-		[0] = cpu_to_le32(offset),
-		[1] = ((__le32 *) &i->seq)[0],
-		[2] = ((__le32 *) &i->seq)[1],
-		[3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE,
-	}};
-}
-
-static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
-{
-	struct nonce nonce = btree_nonce(i, offset);
-	int ret;
-
-	if (!offset) {
-		struct btree_node *bn = container_of(i, struct btree_node, keys);
-		unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
-
-		ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
-				   &bn->flags, bytes);
-		if (ret)
-			return ret;
-
-		nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
-	}
-
-	return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
-			    vstruct_end(i) - (void *) i->_data);
-}
-
-void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
-
-void bch2_btree_node_drop_keys_outside_node(struct btree *);
-
-void bch2_btree_build_aux_trees(struct btree *);
-void bch2_btree_init_next(struct btree_trans *, struct btree *);
-
-int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
-			      struct btree *, bool, bool *);
-void bch2_btree_node_read(struct btree_trans *, struct btree *, bool);
-int bch2_btree_root_read(struct bch_fs *, enum btree_id,
-			 const struct bkey_i *, unsigned);
-
-bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
-
-enum btree_write_flags {
-	__BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS,
-	__BTREE_WRITE_ALREADY_STARTED,
-};
-#define BTREE_WRITE_ONLY_IF_NEED	BIT(__BTREE_WRITE_ONLY_IF_NEED)
-#define BTREE_WRITE_ALREADY_STARTED	BIT(__BTREE_WRITE_ALREADY_STARTED)
-
-void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned);
-void bch2_btree_node_write(struct bch_fs *, struct btree *,
-			   enum six_lock_type, unsigned);
-
-static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
-					    enum six_lock_type lock_held)
-{
-	bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED);
-}
-
-bool bch2_btree_flush_all_reads(struct bch_fs *);
-bool bch2_btree_flush_all_writes(struct bch_fs *);
-
-static inline void compat_bformat(unsigned level, enum btree_id btree_id,
-				  unsigned version, unsigned big_endian,
-				  int write, struct bkey_format *f)
-{
-	if (version < bcachefs_metadata_version_inode_btree_change &&
-	    btree_id == BTREE_ID_inodes) {
-		swap(f->bits_per_field[BKEY_FIELD_INODE],
-		     f->bits_per_field[BKEY_FIELD_OFFSET]);
-		swap(f->field_offset[BKEY_FIELD_INODE],
-		     f->field_offset[BKEY_FIELD_OFFSET]);
-	}
-
-	if (version < bcachefs_metadata_version_snapshot &&
-	    (level || btree_type_has_snapshots(btree_id))) {
-		u64 max_packed =
-			~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
-
-		f->field_offset[BKEY_FIELD_SNAPSHOT] = write
-			? 0
-			: cpu_to_le64(U32_MAX - max_packed);
-	}
-}
-
-static inline void compat_bpos(unsigned level, enum btree_id btree_id,
-			       unsigned version, unsigned big_endian,
-			       int write, struct bpos *p)
-{
-	if (big_endian != CPU_BIG_ENDIAN)
-		bch2_bpos_swab(p);
-
-	if (version < bcachefs_metadata_version_inode_btree_change &&
-	    btree_id == BTREE_ID_inodes)
-		swap(p->inode, p->offset);
-}
-
-static inline void compat_btree_node(unsigned level, enum btree_id btree_id,
-				     unsigned version, unsigned big_endian,
-				     int write,
-				     struct btree_node *bn)
-{
-	if (version < bcachefs_metadata_version_inode_btree_change &&
-	    btree_id_is_extents(btree_id) &&
-	    !bpos_eq(bn->min_key, POS_MIN) &&
-	    write)
-		bn->min_key = bpos_nosnap_predecessor(bn->min_key);
-
-	if (version < bcachefs_metadata_version_snapshot &&
-	    write)
-		bn->max_key.snapshot = 0;
-
-	compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key);
-	compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key);
-
-	if (version < bcachefs_metadata_version_snapshot &&
-	    !write)
-		bn->max_key.snapshot = U32_MAX;
-
-	if (version < bcachefs_metadata_version_inode_btree_change &&
-	    btree_id_is_extents(btree_id) &&
-	    !bpos_eq(bn->min_key, POS_MIN) &&
-	    !write)
-		bn->min_key = bpos_nosnap_successor(bn->min_key);
-}
-
-void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_IO_H */
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
deleted file mode 100644
index 5bf98cb8b15d..000000000000
--- a/fs/bcachefs/btree_iter.c
+++ /dev/null
@@ -1,3445 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_methods.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "btree_journal_iter.h"
-#include "btree_key_cache.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "debug.h"
-#include "error.h"
-#include "extents.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "replicas.h"
-#include "snapshot.h"
-#include "trace.h"
-
-#include <linux/random.h>
-#include <linux/prefetch.h>
-
-static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
-static inline void btree_path_list_add(struct btree_trans *,
-			btree_path_idx_t, btree_path_idx_t);
-
-static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
-{
-#ifdef TRACK_PATH_ALLOCATED
-	return iter->ip_allocated;
-#else
-	return 0;
-#endif
-}
-
-static btree_path_idx_t btree_path_alloc(struct btree_trans *, btree_path_idx_t);
-static void bch2_trans_srcu_lock(struct btree_trans *);
-
-static inline int __btree_path_cmp(const struct btree_path *l,
-				   enum btree_id	r_btree_id,
-				   bool			r_cached,
-				   struct bpos		r_pos,
-				   unsigned		r_level)
-{
-	/*
-	 * Must match lock ordering as defined by __bch2_btree_node_lock:
-	 */
-	return   cmp_int(l->btree_id,	r_btree_id) ?:
-		 cmp_int((int) l->cached,	(int) r_cached) ?:
-		 bpos_cmp(l->pos,	r_pos) ?:
-		-cmp_int(l->level,	r_level);
-}
-
-static inline int btree_path_cmp(const struct btree_path *l,
-				 const struct btree_path *r)
-{
-	return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
-}
-
-static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
-{
-	/* Are we iterating over keys in all snapshots? */
-	if (iter->flags & BTREE_ITER_all_snapshots) {
-		p = bpos_successor(p);
-	} else {
-		p = bpos_nosnap_successor(p);
-		p.snapshot = iter->snapshot;
-	}
-
-	return p;
-}
-
-static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
-{
-	/* Are we iterating over keys in all snapshots? */
-	if (iter->flags & BTREE_ITER_all_snapshots) {
-		p = bpos_predecessor(p);
-	} else {
-		p = bpos_nosnap_predecessor(p);
-		p.snapshot = iter->snapshot;
-	}
-
-	return p;
-}
-
-static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
-{
-	struct bpos pos = iter->pos;
-
-	if ((iter->flags & BTREE_ITER_is_extents) &&
-	    !bkey_eq(pos, POS_MAX))
-		pos = bkey_successor(iter, pos);
-	return pos;
-}
-
-static inline bool btree_path_pos_before_node(struct btree_path *path,
-					      struct btree *b)
-{
-	return bpos_lt(path->pos, b->data->min_key);
-}
-
-static inline bool btree_path_pos_after_node(struct btree_path *path,
-					     struct btree *b)
-{
-	return bpos_gt(path->pos, b->key.k.p);
-}
-
-static inline bool btree_path_pos_in_node(struct btree_path *path,
-					  struct btree *b)
-{
-	return path->btree_id == b->c.btree_id &&
-		!btree_path_pos_before_node(path, b) &&
-		!btree_path_pos_after_node(path, b);
-}
-
-/* Btree iterator: */
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-static void bch2_btree_path_verify_cached(struct btree_trans *trans,
-					  struct btree_path *path)
-{
-	struct bkey_cached *ck;
-	bool locked = btree_node_locked(path, 0);
-
-	if (!bch2_btree_node_relock(trans, path, 0))
-		return;
-
-	ck = (void *) path->l[0].b;
-	BUG_ON(ck->key.btree_id != path->btree_id ||
-	       !bkey_eq(ck->key.pos, path->pos));
-
-	if (!locked)
-		btree_node_unlock(trans, path, 0);
-}
-
-static void bch2_btree_path_verify_level(struct btree_trans *trans,
-				struct btree_path *path, unsigned level)
-{
-	struct btree_path_level *l;
-	struct btree_node_iter tmp;
-	bool locked;
-	struct bkey_packed *p, *k;
-	struct printbuf buf1 = PRINTBUF;
-	struct printbuf buf2 = PRINTBUF;
-	struct printbuf buf3 = PRINTBUF;
-	const char *msg;
-
-	if (!bch2_debug_check_iterators)
-		return;
-
-	l	= &path->l[level];
-	tmp	= l->iter;
-	locked	= btree_node_locked(path, level);
-
-	if (path->cached) {
-		if (!level)
-			bch2_btree_path_verify_cached(trans, path);
-		return;
-	}
-
-	if (!btree_path_node(path, level))
-		return;
-
-	if (!bch2_btree_node_relock_notrace(trans, path, level))
-		return;
-
-	BUG_ON(!btree_path_pos_in_node(path, l->b));
-
-	bch2_btree_node_iter_verify(&l->iter, l->b);
-
-	/*
-	 * For interior nodes, the iterator will have skipped past deleted keys:
-	 */
-	p = level
-		? bch2_btree_node_iter_prev(&tmp, l->b)
-		: bch2_btree_node_iter_prev_all(&tmp, l->b);
-	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
-
-	if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
-		msg = "before";
-		goto err;
-	}
-
-	if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
-		msg = "after";
-		goto err;
-	}
-
-	if (!locked)
-		btree_node_unlock(trans, path, level);
-	return;
-err:
-	bch2_bpos_to_text(&buf1, path->pos);
-
-	if (p) {
-		struct bkey uk = bkey_unpack_key(l->b, p);
-
-		bch2_bkey_to_text(&buf2, &uk);
-	} else {
-		prt_printf(&buf2, "(none)");
-	}
-
-	if (k) {
-		struct bkey uk = bkey_unpack_key(l->b, k);
-
-		bch2_bkey_to_text(&buf3, &uk);
-	} else {
-		prt_printf(&buf3, "(none)");
-	}
-
-	panic("path should be %s key at level %u:\n"
-	      "path pos %s\n"
-	      "prev key %s\n"
-	      "cur  key %s\n",
-	      msg, level, buf1.buf, buf2.buf, buf3.buf);
-}
-
-static void bch2_btree_path_verify(struct btree_trans *trans,
-				   struct btree_path *path)
-{
-	struct bch_fs *c = trans->c;
-	unsigned i;
-
-	EBUG_ON(path->btree_id >= BTREE_ID_NR);
-
-	for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
-		if (!path->l[i].b) {
-			BUG_ON(!path->cached &&
-			       bch2_btree_id_root(c, path->btree_id)->b->c.level > i);
-			break;
-		}
-
-		bch2_btree_path_verify_level(trans, path, i);
-	}
-
-	bch2_btree_path_verify_locks(path);
-}
-
-void bch2_trans_verify_paths(struct btree_trans *trans)
-{
-	struct btree_path *path;
-	unsigned iter;
-
-	trans_for_each_path(trans, path, iter)
-		bch2_btree_path_verify(trans, path);
-}
-
-static void bch2_btree_iter_verify(struct btree_iter *iter)
-{
-	struct btree_trans *trans = iter->trans;
-
-	BUG_ON(iter->btree_id >= BTREE_ID_NR);
-
-	BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached);
-
-	BUG_ON((iter->flags & BTREE_ITER_is_extents) &&
-	       (iter->flags & BTREE_ITER_all_snapshots));
-
-	BUG_ON(!(iter->flags & BTREE_ITER_snapshot_field) &&
-	       (iter->flags & BTREE_ITER_all_snapshots) &&
-	       !btree_type_has_snapshot_field(iter->btree_id));
-
-	if (iter->update_path)
-		bch2_btree_path_verify(trans, &trans->paths[iter->update_path]);
-	bch2_btree_path_verify(trans, btree_iter_path(trans, iter));
-}
-
-static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
-{
-	BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) &&
-	       !iter->pos.snapshot);
-
-	BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) &&
-	       iter->pos.snapshot != iter->snapshot);
-
-	BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
-	       bkey_gt(iter->pos, iter->k.p));
-}
-
-static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
-{
-	struct btree_trans *trans = iter->trans;
-	struct btree_iter copy;
-	struct bkey_s_c prev;
-	int ret = 0;
-
-	if (!bch2_debug_check_iterators)
-		return 0;
-
-	if (!(iter->flags & BTREE_ITER_filter_snapshots))
-		return 0;
-
-	if (bkey_err(k) || !k.k)
-		return 0;
-
-	BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
-					  iter->snapshot,
-					  k.k->p.snapshot));
-
-	bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
-			     BTREE_ITER_nopreserve|
-			     BTREE_ITER_all_snapshots);
-	prev = bch2_btree_iter_prev(&copy);
-	if (!prev.k)
-		goto out;
-
-	ret = bkey_err(prev);
-	if (ret)
-		goto out;
-
-	if (bkey_eq(prev.k->p, k.k->p) &&
-	    bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
-				      prev.k->p.snapshot) > 0) {
-		struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
-
-		bch2_bkey_to_text(&buf1, k.k);
-		bch2_bkey_to_text(&buf2, prev.k);
-
-		panic("iter snap %u\n"
-		      "k    %s\n"
-		      "prev %s\n",
-		      iter->snapshot,
-		      buf1.buf, buf2.buf);
-	}
-out:
-	bch2_trans_iter_exit(trans, &copy);
-	return ret;
-}
-
-void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
-			    struct bpos pos, bool key_cache)
-{
-	struct btree_path *path;
-	struct trans_for_each_path_inorder_iter iter;
-	struct printbuf buf = PRINTBUF;
-
-	btree_trans_sort_paths(trans);
-
-	trans_for_each_path_inorder(trans, path, iter) {
-		int cmp = cmp_int(path->btree_id, id) ?:
-			cmp_int(path->cached, key_cache);
-
-		if (cmp > 0)
-			break;
-		if (cmp < 0)
-			continue;
-
-		if (!btree_node_locked(path, 0) ||
-		    !path->should_be_locked)
-			continue;
-
-		if (!key_cache) {
-			if (bkey_ge(pos, path->l[0].b->data->min_key) &&
-			    bkey_le(pos, path->l[0].b->key.k.p))
-				return;
-		} else {
-			if (bkey_eq(pos, path->pos))
-				return;
-		}
-	}
-
-	bch2_dump_trans_paths_updates(trans);
-	bch2_bpos_to_text(&buf, pos);
-
-	panic("not locked: %s %s%s\n",
-	      bch2_btree_id_str(id), buf.buf,
-	      key_cache ? " cached" : "");
-}
-
-#else
-
-static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
-						struct btree_path *path, unsigned l) {}
-static inline void bch2_btree_path_verify(struct btree_trans *trans,
-					  struct btree_path *path) {}
-static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
-static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
-static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
-
-#endif
-
-/* Btree path: fixups after btree updates */
-
-static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
-					struct btree *b,
-					struct bset_tree *t,
-					struct bkey_packed *k)
-{
-	struct btree_node_iter_set *set;
-
-	btree_node_iter_for_each(iter, set)
-		if (set->end == t->end_offset) {
-			set->k = __btree_node_key_to_offset(b, k);
-			bch2_btree_node_iter_sort(iter, b);
-			return;
-		}
-
-	bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
-}
-
-static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
-					       struct btree *b,
-					       struct bkey_packed *where)
-{
-	struct btree_path_level *l = &path->l[b->c.level];
-
-	if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
-		return;
-
-	if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
-		bch2_btree_node_iter_advance(&l->iter, l->b);
-}
-
-void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
-				      struct btree *b,
-				      struct bkey_packed *where)
-{
-	struct btree_path *path;
-	unsigned i;
-
-	trans_for_each_path_with_node(trans, b, path, i) {
-		__bch2_btree_path_fix_key_modified(path, b, where);
-		bch2_btree_path_verify_level(trans, path, b->c.level);
-	}
-}
-
-static void __bch2_btree_node_iter_fix(struct btree_path *path,
-				       struct btree *b,
-				       struct btree_node_iter *node_iter,
-				       struct bset_tree *t,
-				       struct bkey_packed *where,
-				       unsigned clobber_u64s,
-				       unsigned new_u64s)
-{
-	const struct bkey_packed *end = btree_bkey_last(b, t);
-	struct btree_node_iter_set *set;
-	unsigned offset = __btree_node_key_to_offset(b, where);
-	int shift = new_u64s - clobber_u64s;
-	unsigned old_end = t->end_offset - shift;
-	unsigned orig_iter_pos = node_iter->data[0].k;
-	bool iter_current_key_modified =
-		orig_iter_pos >= offset &&
-		orig_iter_pos <= offset + clobber_u64s;
-
-	btree_node_iter_for_each(node_iter, set)
-		if (set->end == old_end)
-			goto found;
-
-	/* didn't find the bset in the iterator - might have to readd it: */
-	if (new_u64s &&
-	    bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
-		bch2_btree_node_iter_push(node_iter, b, where, end);
-		goto fixup_done;
-	} else {
-		/* Iterator is after key that changed */
-		return;
-	}
-found:
-	set->end = t->end_offset;
-
-	/* Iterator hasn't gotten to the key that changed yet: */
-	if (set->k < offset)
-		return;
-
-	if (new_u64s &&
-	    bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
-		set->k = offset;
-	} else if (set->k < offset + clobber_u64s) {
-		set->k = offset + new_u64s;
-		if (set->k == set->end)
-			bch2_btree_node_iter_set_drop(node_iter, set);
-	} else {
-		/* Iterator is after key that changed */
-		set->k = (int) set->k + shift;
-		return;
-	}
-
-	bch2_btree_node_iter_sort(node_iter, b);
-fixup_done:
-	if (node_iter->data[0].k != orig_iter_pos)
-		iter_current_key_modified = true;
-
-	/*
-	 * When a new key is added, and the node iterator now points to that
-	 * key, the iterator might have skipped past deleted keys that should
-	 * come after the key the iterator now points to. We have to rewind to
-	 * before those deleted keys - otherwise
-	 * bch2_btree_node_iter_prev_all() breaks:
-	 */
-	if (!bch2_btree_node_iter_end(node_iter) &&
-	    iter_current_key_modified &&
-	    b->c.level) {
-		struct bkey_packed *k, *k2, *p;
-
-		k = bch2_btree_node_iter_peek_all(node_iter, b);
-
-		for_each_bset(b, t) {
-			bool set_pos = false;
-
-			if (node_iter->data[0].end == t->end_offset)
-				continue;
-
-			k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
-
-			while ((p = bch2_bkey_prev_all(b, t, k2)) &&
-			       bkey_iter_cmp(b, k, p) < 0) {
-				k2 = p;
-				set_pos = true;
-			}
-
-			if (set_pos)
-				btree_node_iter_set_set_pos(node_iter,
-							    b, t, k2);
-		}
-	}
-}
-
-void bch2_btree_node_iter_fix(struct btree_trans *trans,
-			      struct btree_path *path,
-			      struct btree *b,
-			      struct btree_node_iter *node_iter,
-			      struct bkey_packed *where,
-			      unsigned clobber_u64s,
-			      unsigned new_u64s)
-{
-	struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
-	struct btree_path *linked;
-	unsigned i;
-
-	if (node_iter != &path->l[b->c.level].iter) {
-		__bch2_btree_node_iter_fix(path, b, node_iter, t,
-					   where, clobber_u64s, new_u64s);
-
-		if (bch2_debug_check_iterators)
-			bch2_btree_node_iter_verify(node_iter, b);
-	}
-
-	trans_for_each_path_with_node(trans, b, linked, i) {
-		__bch2_btree_node_iter_fix(linked, b,
-					   &linked->l[b->c.level].iter, t,
-					   where, clobber_u64s, new_u64s);
-		bch2_btree_path_verify_level(trans, linked, b->c.level);
-	}
-}
-
-/* Btree path level: pointer to a particular btree node and node iter */
-
-static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
-						  struct btree_path_level *l,
-						  struct bkey *u,
-						  struct bkey_packed *k)
-{
-	if (unlikely(!k)) {
-		/*
-		 * signal to bch2_btree_iter_peek_slot() that we're currently at
-		 * a hole
-		 */
-		u->type = KEY_TYPE_deleted;
-		return bkey_s_c_null;
-	}
-
-	return bkey_disassemble(l->b, k, u);
-}
-
-static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
-							struct btree_path_level *l,
-							struct bkey *u)
-{
-	return __btree_iter_unpack(c, l, u,
-			bch2_btree_node_iter_peek_all(&l->iter, l->b));
-}
-
-static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
-						    struct btree_path *path,
-						    struct btree_path_level *l,
-						    struct bkey *u)
-{
-	struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
-			bch2_btree_node_iter_peek(&l->iter, l->b));
-
-	path->pos = k.k ? k.k->p : l->b->key.k.p;
-	trans->paths_sorted = false;
-	bch2_btree_path_verify_level(trans, path, l - path->l);
-	return k;
-}
-
-static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
-						    struct btree_path *path,
-						    struct btree_path_level *l,
-						    struct bkey *u)
-{
-	struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
-			bch2_btree_node_iter_prev(&l->iter, l->b));
-
-	path->pos = k.k ? k.k->p : l->b->data->min_key;
-	trans->paths_sorted = false;
-	bch2_btree_path_verify_level(trans, path, l - path->l);
-	return k;
-}
-
-static inline bool btree_path_advance_to_pos(struct btree_path *path,
-					     struct btree_path_level *l,
-					     int max_advance)
-{
-	struct bkey_packed *k;
-	int nr_advanced = 0;
-
-	while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
-	       bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
-		if (max_advance > 0 && nr_advanced >= max_advance)
-			return false;
-
-		bch2_btree_node_iter_advance(&l->iter, l->b);
-		nr_advanced++;
-	}
-
-	return true;
-}
-
-static inline void __btree_path_level_init(struct btree_path *path,
-					   unsigned level)
-{
-	struct btree_path_level *l = &path->l[level];
-
-	bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
-
-	/*
-	 * Iterators to interior nodes should always be pointed at the first non
-	 * whiteout:
-	 */
-	if (level)
-		bch2_btree_node_iter_peek(&l->iter, l->b);
-}
-
-void bch2_btree_path_level_init(struct btree_trans *trans,
-				struct btree_path *path,
-				struct btree *b)
-{
-	BUG_ON(path->cached);
-
-	EBUG_ON(!btree_path_pos_in_node(path, b));
-
-	path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
-	path->l[b->c.level].b = b;
-	__btree_path_level_init(path, b->c.level);
-}
-
-/* Btree path: fixups after btree node updates: */
-
-static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b)
-{
-	struct bch_fs *c = trans->c;
-
-	trans_for_each_update(trans, i)
-		if (!i->cached &&
-		    i->level	== b->c.level &&
-		    i->btree_id	== b->c.btree_id &&
-		    bpos_cmp(i->k->k.p, b->data->min_key) >= 0 &&
-		    bpos_cmp(i->k->k.p, b->data->max_key) <= 0) {
-			i->old_v = bch2_btree_path_peek_slot(trans->paths + i->path, &i->old_k).v;
-
-			if (unlikely(trans->journal_replay_not_finished)) {
-				struct bkey_i *j_k =
-					bch2_journal_keys_peek_slot(c, i->btree_id, i->level,
-								    i->k->k.p);
-
-				if (j_k) {
-					i->old_k = j_k->k;
-					i->old_v = &j_k->v;
-				}
-			}
-		}
-}
-
-/*
- * A btree node is being replaced - update the iterator to point to the new
- * node:
- */
-void bch2_trans_node_add(struct btree_trans *trans,
-			 struct btree_path *path,
-			 struct btree *b)
-{
-	struct btree_path *prev;
-
-	BUG_ON(!btree_path_pos_in_node(path, b));
-
-	while ((prev = prev_btree_path(trans, path)) &&
-	       btree_path_pos_in_node(prev, b))
-		path = prev;
-
-	for (;
-	     path && btree_path_pos_in_node(path, b);
-	     path = next_btree_path(trans, path))
-		if (path->uptodate == BTREE_ITER_UPTODATE && !path->cached) {
-			enum btree_node_locked_type t =
-				btree_lock_want(path, b->c.level);
-
-			if (t != BTREE_NODE_UNLOCKED) {
-				btree_node_unlock(trans, path, b->c.level);
-				six_lock_increment(&b->c.lock, (enum six_lock_type) t);
-				mark_btree_node_locked(trans, path, b->c.level, t);
-			}
-
-			bch2_btree_path_level_init(trans, path, b);
-		}
-
-	bch2_trans_revalidate_updates_in_node(trans, b);
-}
-
-/*
- * A btree node has been modified in such a way as to invalidate iterators - fix
- * them:
- */
-void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
-{
-	struct btree_path *path;
-	unsigned i;
-
-	trans_for_each_path_with_node(trans, b, path, i)
-		__btree_path_level_init(path, b->c.level);
-
-	bch2_trans_revalidate_updates_in_node(trans, b);
-}
-
-/* Btree path: traverse, set_pos: */
-
-static inline int btree_path_lock_root(struct btree_trans *trans,
-				       struct btree_path *path,
-				       unsigned depth_want,
-				       unsigned long trace_ip)
-{
-	struct bch_fs *c = trans->c;
-	struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b;
-	enum six_lock_type lock_type;
-	unsigned i;
-	int ret;
-
-	EBUG_ON(path->nodes_locked);
-
-	while (1) {
-		b = READ_ONCE(*rootp);
-		path->level = READ_ONCE(b->c.level);
-
-		if (unlikely(path->level < depth_want)) {
-			/*
-			 * the root is at a lower depth than the depth we want:
-			 * got to the end of the btree, or we're walking nodes
-			 * greater than some depth and there are no nodes >=
-			 * that depth
-			 */
-			path->level = depth_want;
-			for (i = path->level; i < BTREE_MAX_DEPTH; i++)
-				path->l[i].b = NULL;
-			return 1;
-		}
-
-		lock_type = __btree_lock_want(path, path->level);
-		ret = btree_node_lock(trans, path, &b->c,
-				      path->level, lock_type, trace_ip);
-		if (unlikely(ret)) {
-			if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed))
-				continue;
-			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-				return ret;
-			BUG();
-		}
-
-		if (likely(b == READ_ONCE(*rootp) &&
-			   b->c.level == path->level &&
-			   !race_fault())) {
-			for (i = 0; i < path->level; i++)
-				path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
-			path->l[path->level].b = b;
-			for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
-				path->l[i].b = NULL;
-
-			mark_btree_node_locked(trans, path, path->level,
-					       (enum btree_node_locked_type) lock_type);
-			bch2_btree_path_level_init(trans, path, b);
-			return 0;
-		}
-
-		six_unlock_type(&b->c.lock, lock_type);
-	}
-}
-
-noinline
-static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_path_level *l = path_l(path);
-	struct btree_node_iter node_iter = l->iter;
-	struct bkey_packed *k;
-	struct bkey_buf tmp;
-	unsigned nr = test_bit(BCH_FS_started, &c->flags)
-		? (path->level > 1 ? 0 :  2)
-		: (path->level > 1 ? 1 : 16);
-	bool was_locked = btree_node_locked(path, path->level);
-	int ret = 0;
-
-	bch2_bkey_buf_init(&tmp);
-
-	while (nr-- && !ret) {
-		if (!bch2_btree_node_relock(trans, path, path->level))
-			break;
-
-		bch2_btree_node_iter_advance(&node_iter, l->b);
-		k = bch2_btree_node_iter_peek(&node_iter, l->b);
-		if (!k)
-			break;
-
-		bch2_bkey_buf_unpack(&tmp, c, l->b, k);
-		ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
-					       path->level - 1);
-	}
-
-	if (!was_locked)
-		btree_node_unlock(trans, path, path->level);
-
-	bch2_bkey_buf_exit(&tmp, c);
-	return ret;
-}
-
-static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
-				 struct btree_and_journal_iter *jiter)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_s_c k;
-	struct bkey_buf tmp;
-	unsigned nr = test_bit(BCH_FS_started, &c->flags)
-		? (path->level > 1 ? 0 :  2)
-		: (path->level > 1 ? 1 : 16);
-	bool was_locked = btree_node_locked(path, path->level);
-	int ret = 0;
-
-	bch2_bkey_buf_init(&tmp);
-
-	while (nr-- && !ret) {
-		if (!bch2_btree_node_relock(trans, path, path->level))
-			break;
-
-		bch2_btree_and_journal_iter_advance(jiter);
-		k = bch2_btree_and_journal_iter_peek(jiter);
-		if (!k.k)
-			break;
-
-		bch2_bkey_buf_reassemble(&tmp, c, k);
-		ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
-					       path->level - 1);
-	}
-
-	if (!was_locked)
-		btree_node_unlock(trans, path, path->level);
-
-	bch2_bkey_buf_exit(&tmp, c);
-	return ret;
-}
-
-static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
-					    struct btree_path *path,
-					    unsigned plevel, struct btree *b)
-{
-	struct btree_path_level *l = &path->l[plevel];
-	bool locked = btree_node_locked(path, plevel);
-	struct bkey_packed *k;
-	struct bch_btree_ptr_v2 *bp;
-
-	if (!bch2_btree_node_relock(trans, path, plevel))
-		return;
-
-	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
-	BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
-
-	bp = (void *) bkeyp_val(&l->b->format, k);
-	bp->mem_ptr = (unsigned long)b;
-
-	if (!locked)
-		btree_node_unlock(trans, path, plevel);
-}
-
-static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
-						     struct btree_path *path,
-						     unsigned flags,
-						     struct bkey_buf *out)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_path_level *l = path_l(path);
-	struct btree_and_journal_iter jiter;
-	struct bkey_s_c k;
-	int ret = 0;
-
-	__bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos);
-
-	k = bch2_btree_and_journal_iter_peek(&jiter);
-
-	bch2_bkey_buf_reassemble(out, c, k);
-
-	if ((flags & BTREE_ITER_prefetch) &&
-	    c->opts.btree_node_prefetch)
-		ret = btree_path_prefetch_j(trans, path, &jiter);
-
-	bch2_btree_and_journal_iter_exit(&jiter);
-	return ret;
-}
-
-static __always_inline int btree_path_down(struct btree_trans *trans,
-					   struct btree_path *path,
-					   unsigned flags,
-					   unsigned long trace_ip)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_path_level *l = path_l(path);
-	struct btree *b;
-	unsigned level = path->level - 1;
-	enum six_lock_type lock_type = __btree_lock_want(path, level);
-	struct bkey_buf tmp;
-	int ret;
-
-	EBUG_ON(!btree_node_locked(path, path->level));
-
-	bch2_bkey_buf_init(&tmp);
-
-	if (unlikely(trans->journal_replay_not_finished)) {
-		ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
-		if (ret)
-			goto err;
-	} else {
-		struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b);
-		if (!k) {
-			struct printbuf buf = PRINTBUF;
-
-			prt_str(&buf, "node not found at pos ");
-			bch2_bpos_to_text(&buf, path->pos);
-			prt_str(&buf, " within parent node ");
-			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key));
-
-			bch2_fs_fatal_error(c, "%s", buf.buf);
-			printbuf_exit(&buf);
-			ret = -BCH_ERR_btree_need_topology_repair;
-			goto err;
-		}
-
-		bch2_bkey_buf_unpack(&tmp, c, l->b, k);
-
-		if ((flags & BTREE_ITER_prefetch) &&
-		    c->opts.btree_node_prefetch) {
-			ret = btree_path_prefetch(trans, path);
-			if (ret)
-				goto err;
-		}
-	}
-
-	b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
-	ret = PTR_ERR_OR_ZERO(b);
-	if (unlikely(ret))
-		goto err;
-
-	if (likely(!trans->journal_replay_not_finished &&
-		   tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
-	    unlikely(b != btree_node_mem_ptr(tmp.k)))
-		btree_node_mem_ptr_set(trans, path, level + 1, b);
-
-	if (btree_node_read_locked(path, level + 1))
-		btree_node_unlock(trans, path, level + 1);
-
-	mark_btree_node_locked(trans, path, level,
-			       (enum btree_node_locked_type) lock_type);
-	path->level = level;
-	bch2_btree_path_level_init(trans, path, b);
-
-	bch2_btree_path_verify_locks(path);
-err:
-	bch2_bkey_buf_exit(&tmp, c);
-	return ret;
-}
-
-static int bch2_btree_path_traverse_all(struct btree_trans *trans)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_path *path;
-	unsigned long trace_ip = _RET_IP_;
-	unsigned i;
-	int ret = 0;
-
-	if (trans->in_traverse_all)
-		return -BCH_ERR_transaction_restart_in_traverse_all;
-
-	trans->in_traverse_all = true;
-retry_all:
-	trans->restarted = 0;
-	trans->last_restarted_ip = 0;
-
-	trans_for_each_path(trans, path, i)
-		path->should_be_locked = false;
-
-	btree_trans_sort_paths(trans);
-
-	bch2_trans_unlock(trans);
-	cond_resched();
-	trans->locked = true;
-
-	if (unlikely(trans->memory_allocation_failure)) {
-		struct closure cl;
-
-		closure_init_stack(&cl);
-
-		do {
-			ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
-			closure_sync(&cl);
-		} while (ret);
-	}
-
-	/* Now, redo traversals in correct order: */
-	i = 0;
-	while (i < trans->nr_sorted) {
-		btree_path_idx_t idx = trans->sorted[i];
-
-		/*
-		 * Traversing a path can cause another path to be added at about
-		 * the same position:
-		 */
-		if (trans->paths[idx].uptodate) {
-			__btree_path_get(&trans->paths[idx], false);
-			ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_);
-			__btree_path_put(&trans->paths[idx], false);
-
-			if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
-			    bch2_err_matches(ret, ENOMEM))
-				goto retry_all;
-			if (ret)
-				goto err;
-		} else {
-			i++;
-		}
-	}
-
-	/*
-	 * We used to assert that all paths had been traversed here
-	 * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since
-	 * path->should_be_locked is not set yet, we might have unlocked and
-	 * then failed to relock a path - that's fine.
-	 */
-err:
-	bch2_btree_cache_cannibalize_unlock(trans);
-
-	trans->in_traverse_all = false;
-
-	trace_and_count(c, trans_traverse_all, trans, trace_ip);
-	return ret;
-}
-
-static inline bool btree_path_check_pos_in_node(struct btree_path *path,
-						unsigned l, int check_pos)
-{
-	if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
-		return false;
-	if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
-		return false;
-	return true;
-}
-
-static inline bool btree_path_good_node(struct btree_trans *trans,
-					struct btree_path *path,
-					unsigned l, int check_pos)
-{
-	return is_btree_node(path, l) &&
-		bch2_btree_node_relock(trans, path, l) &&
-		btree_path_check_pos_in_node(path, l, check_pos);
-}
-
-static void btree_path_set_level_down(struct btree_trans *trans,
-				      struct btree_path *path,
-				      unsigned new_level)
-{
-	unsigned l;
-
-	path->level = new_level;
-
-	for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
-		if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
-			btree_node_unlock(trans, path, l);
-
-	btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
-	bch2_btree_path_verify(trans, path);
-}
-
-static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans,
-							 struct btree_path *path,
-							 int check_pos)
-{
-	unsigned i, l = path->level;
-again:
-	while (btree_path_node(path, l) &&
-	       !btree_path_good_node(trans, path, l, check_pos))
-		__btree_path_set_level_up(trans, path, l++);
-
-	/* If we need intent locks, take them too: */
-	for (i = l + 1;
-	     i < path->locks_want && btree_path_node(path, i);
-	     i++)
-		if (!bch2_btree_node_relock(trans, path, i)) {
-			while (l <= i)
-				__btree_path_set_level_up(trans, path, l++);
-			goto again;
-		}
-
-	return l;
-}
-
-static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
-						     struct btree_path *path,
-						     int check_pos)
-{
-	return likely(btree_node_locked(path, path->level) &&
-		      btree_path_check_pos_in_node(path, path->level, check_pos))
-		? path->level
-		: __btree_path_up_until_good_node(trans, path, check_pos);
-}
-
-/*
- * This is the main state machine for walking down the btree - walks down to a
- * specified depth
- *
- * Returns 0 on success, -EIO on error (error reading in a btree node).
- *
- * On error, caller (peek_node()/peek_key()) must return NULL; the error is
- * stashed in the iterator and returned from bch2_trans_exit().
- */
-int bch2_btree_path_traverse_one(struct btree_trans *trans,
-				 btree_path_idx_t path_idx,
-				 unsigned flags,
-				 unsigned long trace_ip)
-{
-	struct btree_path *path = &trans->paths[path_idx];
-	unsigned depth_want = path->level;
-	int ret = -((int) trans->restarted);
-
-	if (unlikely(ret))
-		goto out;
-
-	if (unlikely(!trans->srcu_held))
-		bch2_trans_srcu_lock(trans);
-
-	/*
-	 * Ensure we obey path->should_be_locked: if it's set, we can't unlock
-	 * and re-traverse the path without a transaction restart:
-	 */
-	if (path->should_be_locked) {
-		ret = bch2_btree_path_relock(trans, path, trace_ip);
-		goto out;
-	}
-
-	if (path->cached) {
-		ret = bch2_btree_path_traverse_cached(trans, path, flags);
-		goto out;
-	}
-
-	path = &trans->paths[path_idx];
-
-	if (unlikely(path->level >= BTREE_MAX_DEPTH))
-		goto out_uptodate;
-
-	path->level = btree_path_up_until_good_node(trans, path, 0);
-	unsigned max_level = path->level;
-
-	EBUG_ON(btree_path_node(path, path->level) &&
-		!btree_node_locked(path, path->level));
-
-	/*
-	 * Note: path->nodes[path->level] may be temporarily NULL here - that
-	 * would indicate to other code that we got to the end of the btree,
-	 * here it indicates that relocking the root failed - it's critical that
-	 * btree_path_lock_root() comes next and that it can't fail
-	 */
-	while (path->level > depth_want) {
-		ret = btree_path_node(path, path->level)
-			? btree_path_down(trans, path, flags, trace_ip)
-			: btree_path_lock_root(trans, path, depth_want, trace_ip);
-		if (unlikely(ret)) {
-			if (ret == 1) {
-				/*
-				 * No nodes at this level - got to the end of
-				 * the btree:
-				 */
-				ret = 0;
-				goto out;
-			}
-
-			__bch2_btree_path_unlock(trans, path);
-			path->level = depth_want;
-			path->l[path->level].b = ERR_PTR(ret);
-			goto out;
-		}
-	}
-
-	if (unlikely(max_level > path->level)) {
-		struct btree_path *linked;
-		unsigned iter;
-
-		trans_for_each_path_with_node(trans, path_l(path)->b, linked, iter)
-			for (unsigned j = path->level + 1; j < max_level; j++)
-				linked->l[j] = path->l[j];
-	}
-
-out_uptodate:
-	path->uptodate = BTREE_ITER_UPTODATE;
-out:
-	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
-		panic("ret %s (%i) trans->restarted %s (%i)\n",
-		      bch2_err_str(ret), ret,
-		      bch2_err_str(trans->restarted), trans->restarted);
-	bch2_btree_path_verify(trans, path);
-	return ret;
-}
-
-static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
-			    struct btree_path *src)
-{
-	unsigned i, offset = offsetof(struct btree_path, pos);
-
-	memcpy((void *) dst + offset,
-	       (void *) src + offset,
-	       sizeof(struct btree_path) - offset);
-
-	for (i = 0; i < BTREE_MAX_DEPTH; i++) {
-		unsigned t = btree_node_locked_type(dst, i);
-
-		if (t != BTREE_NODE_UNLOCKED)
-			six_lock_increment(&dst->l[i].b->c.lock, t);
-	}
-}
-
-static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src,
-					 bool intent, unsigned long ip)
-{
-	btree_path_idx_t new = btree_path_alloc(trans, src);
-	btree_path_copy(trans, trans->paths + new, trans->paths + src);
-	__btree_path_get(trans->paths + new, intent);
-#ifdef TRACK_PATH_ALLOCATED
-	trans->paths[new].ip_allocated = ip;
-#endif
-	return new;
-}
-
-__flatten
-btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans,
-			btree_path_idx_t path, bool intent, unsigned long ip)
-{
-	__btree_path_put(trans->paths + path, intent);
-	path = btree_path_clone(trans, path, intent, ip);
-	trans->paths[path].preserve = false;
-	return path;
-}
-
-btree_path_idx_t __must_check
-__bch2_btree_path_set_pos(struct btree_trans *trans,
-			  btree_path_idx_t path_idx, struct bpos new_pos,
-			  bool intent, unsigned long ip)
-{
-	int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos);
-
-	bch2_trans_verify_not_in_restart(trans);
-	EBUG_ON(!trans->paths[path_idx].ref);
-
-	path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip);
-
-	struct btree_path *path = trans->paths + path_idx;
-	path->pos		= new_pos;
-	trans->paths_sorted	= false;
-
-	if (unlikely(path->cached)) {
-		btree_node_unlock(trans, path, 0);
-		path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
-		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
-		goto out;
-	}
-
-	unsigned level = btree_path_up_until_good_node(trans, path, cmp);
-
-	if (btree_path_node(path, level)) {
-		struct btree_path_level *l = &path->l[level];
-
-		BUG_ON(!btree_node_locked(path, level));
-		/*
-		 * We might have to skip over many keys, or just a few: try
-		 * advancing the node iterator, and if we have to skip over too
-		 * many keys just reinit it (or if we're rewinding, since that
-		 * is expensive).
-		 */
-		if (cmp < 0 ||
-		    !btree_path_advance_to_pos(path, l, 8))
-			bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
-
-		/*
-		 * Iterators to interior nodes should always be pointed at the first non
-		 * whiteout:
-		 */
-		if (unlikely(level))
-			bch2_btree_node_iter_peek(&l->iter, l->b);
-	}
-
-	if (unlikely(level != path->level)) {
-		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
-		__bch2_btree_path_unlock(trans, path);
-	}
-out:
-	bch2_btree_path_verify(trans, path);
-	return path_idx;
-}
-
-/* Btree path: main interface: */
-
-static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
-{
-	struct btree_path *sib;
-
-	sib = prev_btree_path(trans, path);
-	if (sib && !btree_path_cmp(sib, path))
-		return sib;
-
-	sib = next_btree_path(trans, path);
-	if (sib && !btree_path_cmp(sib, path))
-		return sib;
-
-	return NULL;
-}
-
-static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
-{
-	struct btree_path *sib;
-
-	sib = prev_btree_path(trans, path);
-	if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
-		return sib;
-
-	sib = next_btree_path(trans, path);
-	if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
-		return sib;
-
-	return NULL;
-}
-
-static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t path)
-{
-	__bch2_btree_path_unlock(trans, trans->paths + path);
-	btree_path_list_remove(trans, trans->paths + path);
-	__clear_bit(path, trans->paths_allocated);
-}
-
-static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_path *path)
-{
-	unsigned l = path->level;
-
-	do {
-		if (!btree_path_node(path, l))
-			break;
-
-		if (!is_btree_node(path, l))
-			return false;
-
-		if (path->l[l].lock_seq != path->l[l].b->c.lock.seq)
-			return false;
-
-		l++;
-	} while (l < path->locks_want);
-
-	return true;
-}
-
-void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent)
-{
-	struct btree_path *path = trans->paths + path_idx, *dup;
-
-	if (!__btree_path_put(path, intent))
-		return;
-
-	dup = path->preserve
-		? have_path_at_pos(trans, path)
-		: have_node_at_pos(trans, path);
-
-	if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
-		return;
-
-	if (path->should_be_locked && !trans->restarted) {
-		if (!dup)
-			return;
-
-		if (!(trans->locked
-		      ? bch2_btree_path_relock_norestart(trans, dup)
-		      : bch2_btree_path_can_relock(trans, dup)))
-			return;
-	}
-
-	if (dup) {
-		dup->preserve		|= path->preserve;
-		dup->should_be_locked	|= path->should_be_locked;
-	}
-
-	__bch2_path_free(trans, path_idx);
-}
-
-static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path,
-				 bool intent)
-{
-	if (!__btree_path_put(trans->paths + path, intent))
-		return;
-
-	__bch2_path_free(trans, path);
-}
-
-void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
-{
-	panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
-	      trans->restart_count, restart_count,
-	      (void *) trans->last_begin_ip);
-}
-
-void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
-{
-	panic("in transaction restart: %s, last restarted by %pS\n",
-	      bch2_err_str(trans->restarted),
-	      (void *) trans->last_restarted_ip);
-}
-
-void __noreturn bch2_trans_unlocked_error(struct btree_trans *trans)
-{
-	panic("trans should be locked, unlocked by %pS\n",
-	      (void *) trans->last_unlock_ip);
-}
-
-noinline __cold
-void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
-{
-	prt_printf(buf, "transaction updates for %s journal seq %llu\n",
-	       trans->fn, trans->journal_res.seq);
-	printbuf_indent_add(buf, 2);
-
-	trans_for_each_update(trans, i) {
-		struct bkey_s_c old = { &i->old_k, i->old_v };
-
-		prt_printf(buf, "update: btree=%s cached=%u %pS\n",
-		       bch2_btree_id_str(i->btree_id),
-		       i->cached,
-		       (void *) i->ip_allocated);
-
-		prt_printf(buf, "  old ");
-		bch2_bkey_val_to_text(buf, trans->c, old);
-		prt_newline(buf);
-
-		prt_printf(buf, "  new ");
-		bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
-		prt_newline(buf);
-	}
-
-	for (struct jset_entry *e = trans->journal_entries;
-	     e != btree_trans_journal_entries_top(trans);
-	     e = vstruct_next(e))
-		bch2_journal_entry_to_text(buf, trans->c, e);
-
-	printbuf_indent_sub(buf, 2);
-}
-
-noinline __cold
-void bch2_dump_trans_updates(struct btree_trans *trans)
-{
-	struct printbuf buf = PRINTBUF;
-
-	bch2_trans_updates_to_text(&buf, trans);
-	bch2_print_string_as_lines(KERN_ERR, buf.buf);
-	printbuf_exit(&buf);
-}
-
-static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
-{
-	struct btree_path *path = trans->paths + path_idx;
-
-	prt_printf(out, "path: idx %2u ref %u:%u %c %c %c btree=%s l=%u pos ",
-		   path_idx, path->ref, path->intent_ref,
-		   path->preserve ? 'P' : ' ',
-		   path->should_be_locked ? 'S' : ' ',
-		   path->cached ? 'C' : 'B',
-		   bch2_btree_id_str(path->btree_id),
-		   path->level);
-	bch2_bpos_to_text(out, path->pos);
-
-#ifdef TRACK_PATH_ALLOCATED
-	prt_printf(out, " %pS", (void *) path->ip_allocated);
-#endif
-}
-
-static const char *btree_node_locked_str(enum btree_node_locked_type t)
-{
-	switch (t) {
-	case BTREE_NODE_UNLOCKED:
-		return "unlocked";
-	case BTREE_NODE_READ_LOCKED:
-		return "read";
-	case BTREE_NODE_INTENT_LOCKED:
-		return "intent";
-	case BTREE_NODE_WRITE_LOCKED:
-		return "write";
-	default:
-		return NULL;
-	}
-}
-
-void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
-{
-	bch2_btree_path_to_text_short(out, trans, path_idx);
-
-	struct btree_path *path = trans->paths + path_idx;
-
-	prt_printf(out, " uptodate %u locks_want %u", path->uptodate, path->locks_want);
-	prt_newline(out);
-
-	printbuf_indent_add(out, 2);
-	for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
-		prt_printf(out, "l=%u locks %s seq %u node ", l,
-			   btree_node_locked_str(btree_node_locked_type(path, l)),
-			   path->l[l].lock_seq);
-
-		int ret = PTR_ERR_OR_ZERO(path->l[l].b);
-		if (ret)
-			prt_str(out, bch2_err_str(ret));
-		else
-			prt_printf(out, "%px", path->l[l].b);
-		prt_newline(out);
-	}
-	printbuf_indent_sub(out, 2);
-}
-
-static noinline __cold
-void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
-				bool nosort)
-{
-	struct trans_for_each_path_inorder_iter iter;
-
-	if (!nosort)
-		btree_trans_sort_paths(trans);
-
-	trans_for_each_path_idx_inorder(trans, iter) {
-		bch2_btree_path_to_text_short(out, trans, iter.path_idx);
-		prt_newline(out);
-	}
-}
-
-noinline __cold
-void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
-{
-	__bch2_trans_paths_to_text(out, trans, false);
-}
-
-static noinline __cold
-void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
-{
-	struct printbuf buf = PRINTBUF;
-
-	__bch2_trans_paths_to_text(&buf, trans, nosort);
-	bch2_trans_updates_to_text(&buf, trans);
-
-	bch2_print_string_as_lines(KERN_ERR, buf.buf);
-	printbuf_exit(&buf);
-}
-
-noinline __cold
-void bch2_dump_trans_paths_updates(struct btree_trans *trans)
-{
-	__bch2_dump_trans_paths_updates(trans, false);
-}
-
-noinline __cold
-static void bch2_trans_update_max_paths(struct btree_trans *trans)
-{
-	struct btree_transaction_stats *s = btree_trans_stats(trans);
-	struct printbuf buf = PRINTBUF;
-	size_t nr = bitmap_weight(trans->paths_allocated, trans->nr_paths);
-
-	bch2_trans_paths_to_text(&buf, trans);
-
-	if (!buf.allocation_failure) {
-		mutex_lock(&s->lock);
-		if (nr > s->nr_max_paths) {
-			s->nr_max_paths = nr;
-			swap(s->max_paths_text, buf.buf);
-		}
-		mutex_unlock(&s->lock);
-	}
-
-	printbuf_exit(&buf);
-
-	trans->nr_paths_max = nr;
-}
-
-noinline __cold
-int __bch2_btree_trans_too_many_iters(struct btree_trans *trans)
-{
-	if (trace_trans_restart_too_many_iters_enabled()) {
-		struct printbuf buf = PRINTBUF;
-
-		bch2_trans_paths_to_text(&buf, trans);
-		trace_trans_restart_too_many_iters(trans, _THIS_IP_, buf.buf);
-		printbuf_exit(&buf);
-	}
-
-	count_event(trans->c, trans_restart_too_many_iters);
-
-	return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
-}
-
-static noinline void btree_path_overflow(struct btree_trans *trans)
-{
-	bch2_dump_trans_paths_updates(trans);
-	bch_err(trans->c, "trans path overflow");
-}
-
-static noinline void btree_paths_realloc(struct btree_trans *trans)
-{
-	unsigned nr = trans->nr_paths * 2;
-
-	void *p = kvzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
-			  sizeof(struct btree_trans_paths) +
-			  nr * sizeof(struct btree_path) +
-			  nr * sizeof(btree_path_idx_t) + 8 +
-			  nr * sizeof(struct btree_insert_entry), GFP_KERNEL|__GFP_NOFAIL);
-
-	unsigned long *paths_allocated = p;
-	memcpy(paths_allocated, trans->paths_allocated, BITS_TO_LONGS(trans->nr_paths) * sizeof(unsigned long));
-	p += BITS_TO_LONGS(nr) * sizeof(unsigned long);
-
-	p += sizeof(struct btree_trans_paths);
-	struct btree_path *paths = p;
-	*trans_paths_nr(paths) = nr;
-	memcpy(paths, trans->paths, trans->nr_paths * sizeof(struct btree_path));
-	p += nr * sizeof(struct btree_path);
-
-	btree_path_idx_t *sorted = p;
-	memcpy(sorted, trans->sorted, trans->nr_sorted * sizeof(btree_path_idx_t));
-	p += nr * sizeof(btree_path_idx_t) + 8;
-
-	struct btree_insert_entry *updates = p;
-	memcpy(updates, trans->updates, trans->nr_paths * sizeof(struct btree_insert_entry));
-
-	unsigned long *old = trans->paths_allocated;
-
-	rcu_assign_pointer(trans->paths_allocated,	paths_allocated);
-	rcu_assign_pointer(trans->paths,		paths);
-	rcu_assign_pointer(trans->sorted,		sorted);
-	rcu_assign_pointer(trans->updates,		updates);
-
-	trans->nr_paths		= nr;
-
-	if (old != trans->_paths_allocated)
-		kfree_rcu_mightsleep(old);
-}
-
-static inline btree_path_idx_t btree_path_alloc(struct btree_trans *trans,
-						btree_path_idx_t pos)
-{
-	btree_path_idx_t idx = find_first_zero_bit(trans->paths_allocated, trans->nr_paths);
-
-	if (unlikely(idx == trans->nr_paths)) {
-		if (trans->nr_paths == BTREE_ITER_MAX) {
-			btree_path_overflow(trans);
-			return 0;
-		}
-
-		btree_paths_realloc(trans);
-	}
-
-	/*
-	 * Do this before marking the new path as allocated, since it won't be
-	 * initialized yet:
-	 */
-	if (unlikely(idx > trans->nr_paths_max))
-		bch2_trans_update_max_paths(trans);
-
-	__set_bit(idx, trans->paths_allocated);
-
-	struct btree_path *path = &trans->paths[idx];
-	path->ref		= 0;
-	path->intent_ref	= 0;
-	path->nodes_locked	= 0;
-
-	btree_path_list_add(trans, pos, idx);
-	trans->paths_sorted = false;
-	return idx;
-}
-
-btree_path_idx_t bch2_path_get(struct btree_trans *trans,
-			     enum btree_id btree_id, struct bpos pos,
-			     unsigned locks_want, unsigned level,
-			     unsigned flags, unsigned long ip)
-{
-	struct btree_path *path;
-	bool cached = flags & BTREE_ITER_cached;
-	bool intent = flags & BTREE_ITER_intent;
-	struct trans_for_each_path_inorder_iter iter;
-	btree_path_idx_t path_pos = 0, path_idx;
-
-	bch2_trans_verify_not_unlocked(trans);
-	bch2_trans_verify_not_in_restart(trans);
-	bch2_trans_verify_locks(trans);
-
-	btree_trans_sort_paths(trans);
-
-	trans_for_each_path_inorder(trans, path, iter) {
-		if (__btree_path_cmp(path,
-				     btree_id,
-				     cached,
-				     pos,
-				     level) > 0)
-			break;
-
-		path_pos = iter.path_idx;
-	}
-
-	if (path_pos &&
-	    trans->paths[path_pos].cached	== cached &&
-	    trans->paths[path_pos].btree_id	== btree_id &&
-	    trans->paths[path_pos].level	== level) {
-		__btree_path_get(trans->paths + path_pos, intent);
-		path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
-		path = trans->paths + path_idx;
-	} else {
-		path_idx = btree_path_alloc(trans, path_pos);
-		path = trans->paths + path_idx;
-
-		__btree_path_get(path, intent);
-		path->pos			= pos;
-		path->btree_id			= btree_id;
-		path->cached			= cached;
-		path->uptodate			= BTREE_ITER_NEED_TRAVERSE;
-		path->should_be_locked		= false;
-		path->level			= level;
-		path->locks_want		= locks_want;
-		path->nodes_locked		= 0;
-		for (unsigned i = 0; i < ARRAY_SIZE(path->l); i++)
-			path->l[i].b		= ERR_PTR(-BCH_ERR_no_btree_node_init);
-#ifdef TRACK_PATH_ALLOCATED
-		path->ip_allocated		= ip;
-#endif
-		trans->paths_sorted		= false;
-	}
-
-	if (!(flags & BTREE_ITER_nopreserve))
-		path->preserve = true;
-
-	if (path->intent_ref)
-		locks_want = max(locks_want, level + 1);
-
-	/*
-	 * If the path has locks_want greater than requested, we don't downgrade
-	 * it here - on transaction restart because btree node split needs to
-	 * upgrade locks, we might be putting/getting the iterator again.
-	 * Downgrading iterators only happens via bch2_trans_downgrade(), after
-	 * a successful transaction commit.
-	 */
-
-	locks_want = min(locks_want, BTREE_MAX_DEPTH);
-	if (locks_want > path->locks_want)
-		bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL);
-
-	return path_idx;
-}
-
-btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *trans,
-					    enum btree_id btree_id,
-					    unsigned level,
-					    struct bpos pos)
-{
-	btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level,
-			     BTREE_ITER_nopreserve|
-			     BTREE_ITER_intent, _RET_IP_);
-	path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_);
-
-	struct btree_path *path = trans->paths + path_idx;
-	bch2_btree_path_downgrade(trans, path);
-	__bch2_btree_path_unlock(trans, path);
-	return path_idx;
-}
-
-struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
-{
-
-	struct btree_path_level *l = path_l(path);
-	struct bkey_packed *_k;
-	struct bkey_s_c k;
-
-	if (unlikely(!l->b))
-		return bkey_s_c_null;
-
-	EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
-	EBUG_ON(!btree_node_locked(path, path->level));
-
-	if (!path->cached) {
-		_k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
-		k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
-
-		EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos));
-
-		if (!k.k || !bpos_eq(path->pos, k.k->p))
-			goto hole;
-	} else {
-		struct bkey_cached *ck = (void *) path->l[0].b;
-
-		EBUG_ON(ck &&
-			(path->btree_id != ck->key.btree_id ||
-			 !bkey_eq(path->pos, ck->key.pos)));
-		if (!ck || !ck->valid)
-			return bkey_s_c_null;
-
-		*u = ck->k->k;
-		k = bkey_i_to_s_c(ck->k);
-	}
-
-	return k;
-hole:
-	bkey_init(u);
-	u->p = path->pos;
-	return (struct bkey_s_c) { u, NULL };
-}
-
-
-void bch2_set_btree_iter_dontneed(struct btree_iter *iter)
-{
-	struct btree_trans *trans = iter->trans;
-
-	if (!iter->path || trans->restarted)
-		return;
-
-	struct btree_path *path = btree_iter_path(trans, iter);
-	path->preserve		= false;
-	if (path->ref == 1)
-		path->should_be_locked	= false;
-}
-/* Btree iterators: */
-
-int __must_check
-__bch2_btree_iter_traverse(struct btree_iter *iter)
-{
-	return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
-}
-
-int __must_check
-bch2_btree_iter_traverse(struct btree_iter *iter)
-{
-	struct btree_trans *trans = iter->trans;
-	int ret;
-
-	bch2_trans_verify_not_unlocked(trans);
-
-	iter->path = bch2_btree_path_set_pos(trans, iter->path,
-					btree_iter_search_key(iter),
-					iter->flags & BTREE_ITER_intent,
-					btree_iter_ip_allocated(iter));
-
-	ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
-	if (ret)
-		return ret;
-
-	struct btree_path *path = btree_iter_path(trans, iter);
-	if (btree_path_node(path, path->level))
-		btree_path_set_should_be_locked(path);
-	return 0;
-}
-
-/* Iterate across nodes (leaf and interior nodes) */
-
-struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
-{
-	struct btree_trans *trans = iter->trans;
-	struct btree *b = NULL;
-	int ret;
-
-	EBUG_ON(trans->paths[iter->path].cached);
-	bch2_btree_iter_verify(iter);
-
-	ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
-	if (ret)
-		goto err;
-
-	struct btree_path *path = btree_iter_path(trans, iter);
-	b = btree_path_node(path, path->level);
-	if (!b)
-		goto out;
-
-	BUG_ON(bpos_lt(b->key.k.p, iter->pos));
-
-	bkey_init(&iter->k);
-	iter->k.p = iter->pos = b->key.k.p;
-
-	iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
-					iter->flags & BTREE_ITER_intent,
-					btree_iter_ip_allocated(iter));
-	btree_path_set_should_be_locked(btree_iter_path(trans, iter));
-out:
-	bch2_btree_iter_verify_entry_exit(iter);
-	bch2_btree_iter_verify(iter);
-
-	return b;
-err:
-	b = ERR_PTR(ret);
-	goto out;
-}
-
-struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter)
-{
-	struct btree *b;
-
-	while (b = bch2_btree_iter_peek_node(iter),
-	       bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart))
-		bch2_trans_begin(iter->trans);
-
-	return b;
-}
-
-struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
-{
-	struct btree_trans *trans = iter->trans;
-	struct btree *b = NULL;
-	int ret;
-
-	EBUG_ON(trans->paths[iter->path].cached);
-	bch2_trans_verify_not_in_restart(trans);
-	bch2_btree_iter_verify(iter);
-
-	struct btree_path *path = btree_iter_path(trans, iter);
-
-	/* already at end? */
-	if (!btree_path_node(path, path->level))
-		return NULL;
-
-	/* got to end? */
-	if (!btree_path_node(path, path->level + 1)) {
-		btree_path_set_level_up(trans, path);
-		return NULL;
-	}
-
-	if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
-		__bch2_btree_path_unlock(trans, path);
-		path->l[path->level].b		= ERR_PTR(-BCH_ERR_no_btree_node_relock);
-		path->l[path->level + 1].b	= ERR_PTR(-BCH_ERR_no_btree_node_relock);
-		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
-		trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
-		ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
-		goto err;
-	}
-
-	b = btree_path_node(path, path->level + 1);
-
-	if (bpos_eq(iter->pos, b->key.k.p)) {
-		__btree_path_set_level_up(trans, path, path->level++);
-	} else {
-		if (btree_lock_want(path, path->level + 1) == BTREE_NODE_UNLOCKED)
-			btree_node_unlock(trans, path, path->level + 1);
-
-		/*
-		 * Haven't gotten to the end of the parent node: go back down to
-		 * the next child node
-		 */
-		iter->path = bch2_btree_path_set_pos(trans, iter->path,
-					bpos_successor(iter->pos),
-					iter->flags & BTREE_ITER_intent,
-					btree_iter_ip_allocated(iter));
-
-		path = btree_iter_path(trans, iter);
-		btree_path_set_level_down(trans, path, iter->min_depth);
-
-		ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
-		if (ret)
-			goto err;
-
-		path = btree_iter_path(trans, iter);
-		b = path->l[path->level].b;
-	}
-
-	bkey_init(&iter->k);
-	iter->k.p = iter->pos = b->key.k.p;
-
-	iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
-					iter->flags & BTREE_ITER_intent,
-					btree_iter_ip_allocated(iter));
-	btree_path_set_should_be_locked(btree_iter_path(trans, iter));
-	EBUG_ON(btree_iter_path(trans, iter)->uptodate);
-out:
-	bch2_btree_iter_verify_entry_exit(iter);
-	bch2_btree_iter_verify(iter);
-
-	return b;
-err:
-	b = ERR_PTR(ret);
-	goto out;
-}
-
-/* Iterate across keys (in leaf nodes only) */
-
-inline bool bch2_btree_iter_advance(struct btree_iter *iter)
-{
-	struct bpos pos = iter->k.p;
-	bool ret = !(iter->flags & BTREE_ITER_all_snapshots
-		     ? bpos_eq(pos, SPOS_MAX)
-		     : bkey_eq(pos, SPOS_MAX));
-
-	if (ret && !(iter->flags & BTREE_ITER_is_extents))
-		pos = bkey_successor(iter, pos);
-	bch2_btree_iter_set_pos(iter, pos);
-	return ret;
-}
-
-inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
-{
-	struct bpos pos = bkey_start_pos(&iter->k);
-	bool ret = !(iter->flags & BTREE_ITER_all_snapshots
-		     ? bpos_eq(pos, POS_MIN)
-		     : bkey_eq(pos, POS_MIN));
-
-	if (ret && !(iter->flags & BTREE_ITER_is_extents))
-		pos = bkey_predecessor(iter, pos);
-	bch2_btree_iter_set_pos(iter, pos);
-	return ret;
-}
-
-static noinline
-void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter,
-					struct bkey_s_c *k)
-{
-	struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key;
-
-	trans_for_each_update(trans, i)
-		if (!i->key_cache_already_flushed &&
-		    i->btree_id == iter->btree_id &&
-		    bpos_le(i->k->k.p, iter->pos) &&
-		    bpos_ge(i->k->k.p, k->k ? k->k->p : end)) {
-			iter->k = i->k->k;
-			*k = bkey_i_to_s_c(i->k);
-		}
-}
-
-static noinline
-void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter,
-				   struct bkey_s_c *k)
-{
-	struct btree_path *path = btree_iter_path(trans, iter);
-	struct bpos end = path_l(path)->b->key.k.p;
-
-	trans_for_each_update(trans, i)
-		if (!i->key_cache_already_flushed &&
-		    i->btree_id == iter->btree_id &&
-		    bpos_ge(i->k->k.p, path->pos) &&
-		    bpos_le(i->k->k.p, k->k ? k->k->p : end)) {
-			iter->k = i->k->k;
-			*k = bkey_i_to_s_c(i->k);
-		}
-}
-
-static noinline
-void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_iter *iter,
-					struct bkey_s_c *k)
-{
-	trans_for_each_update(trans, i)
-		if (!i->key_cache_already_flushed &&
-		    i->btree_id == iter->btree_id &&
-		    bpos_eq(i->k->k.p, iter->pos)) {
-			iter->k = i->k->k;
-			*k = bkey_i_to_s_c(i->k);
-		}
-}
-
-static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
-					      struct btree_iter *iter,
-					      struct bpos end_pos)
-{
-	struct btree_path *path = btree_iter_path(trans, iter);
-
-	return bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
-					   path->level,
-					   path->pos,
-					   end_pos,
-					   &iter->journal_idx);
-}
-
-static noinline
-struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
-					      struct btree_iter *iter)
-{
-	struct btree_path *path = btree_iter_path(trans, iter);
-	struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos);
-
-	if (k) {
-		iter->k = k->k;
-		return bkey_i_to_s_c(k);
-	} else {
-		return bkey_s_c_null;
-	}
-}
-
-static noinline
-struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
-					 struct btree_iter *iter,
-					 struct bkey_s_c k)
-{
-	struct btree_path *path = btree_iter_path(trans, iter);
-	struct bkey_i *next_journal =
-		bch2_btree_journal_peek(trans, iter,
-				k.k ? k.k->p : path_l(path)->b->key.k.p);
-
-	if (next_journal) {
-		iter->k = next_journal->k;
-		k = bkey_i_to_s_c(next_journal);
-	}
-
-	return k;
-}
-
-/*
- * Checks btree key cache for key at iter->pos and returns it if present, or
- * bkey_s_c_null:
- */
-static noinline
-struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
-{
-	struct btree_trans *trans = iter->trans;
-	struct bch_fs *c = trans->c;
-	struct bkey u;
-	struct bkey_s_c k;
-	int ret;
-
-	bch2_trans_verify_not_in_restart(trans);
-	bch2_trans_verify_not_unlocked(trans);
-
-	if ((iter->flags & BTREE_ITER_key_cache_fill) &&
-	    bpos_eq(iter->pos, pos))
-		return bkey_s_c_null;
-
-	if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
-		return bkey_s_c_null;
-
-	if (!iter->key_cache_path)
-		iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
-						     iter->flags & BTREE_ITER_intent, 0,
-						     iter->flags|BTREE_ITER_cached|
-						     BTREE_ITER_cached_nofill,
-						     _THIS_IP_);
-
-	iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
-					iter->flags & BTREE_ITER_intent,
-					btree_iter_ip_allocated(iter));
-
-	ret =   bch2_btree_path_traverse(trans, iter->key_cache_path,
-					 iter->flags|BTREE_ITER_cached) ?:
-		bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_);
-	if (unlikely(ret))
-		return bkey_s_c_err(ret);
-
-	btree_path_set_should_be_locked(trans->paths + iter->key_cache_path);
-
-	k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u);
-	if (k.k && !bkey_err(k)) {
-		iter->k = u;
-		k.k = &iter->k;
-	}
-	return k;
-}
-
-static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
-{
-	struct btree_trans *trans = iter->trans;
-	struct bkey_s_c k, k2;
-	int ret;
-
-	EBUG_ON(btree_iter_path(trans, iter)->cached);
-	bch2_btree_iter_verify(iter);
-
-	while (1) {
-		struct btree_path_level *l;
-
-		iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
-					iter->flags & BTREE_ITER_intent,
-					btree_iter_ip_allocated(iter));
-
-		ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
-		if (unlikely(ret)) {
-			/* ensure that iter->k is consistent with iter->pos: */
-			bch2_btree_iter_set_pos(iter, iter->pos);
-			k = bkey_s_c_err(ret);
-			goto out;
-		}
-
-		struct btree_path *path = btree_iter_path(trans, iter);
-		l = path_l(path);
-
-		if (unlikely(!l->b)) {
-			/* No btree nodes at requested level: */
-			bch2_btree_iter_set_pos(iter, SPOS_MAX);
-			k = bkey_s_c_null;
-			goto out;
-		}
-
-		btree_path_set_should_be_locked(path);
-
-		k = btree_path_level_peek_all(trans->c, l, &iter->k);
-
-		if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
-		    k.k &&
-		    (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
-			k = k2;
-			ret = bkey_err(k);
-			if (ret) {
-				bch2_btree_iter_set_pos(iter, iter->pos);
-				goto out;
-			}
-		}
-
-		if (unlikely(iter->flags & BTREE_ITER_with_journal))
-			k = btree_trans_peek_journal(trans, iter, k);
-
-		if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
-			     trans->nr_updates))
-			bch2_btree_trans_peek_updates(trans, iter, &k);
-
-		if (k.k && bkey_deleted(k.k)) {
-			/*
-			 * If we've got a whiteout, and it's after the search
-			 * key, advance the search key to the whiteout instead
-			 * of just after the whiteout - it might be a btree
-			 * whiteout, with a real key at the same position, since
-			 * in the btree deleted keys sort before non deleted.
-			 */
-			search_key = !bpos_eq(search_key, k.k->p)
-				? k.k->p
-				: bpos_successor(k.k->p);
-			continue;
-		}
-
-		if (likely(k.k)) {
-			break;
-		} else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) {
-			/* Advance to next leaf node: */
-			search_key = bpos_successor(l->b->key.k.p);
-		} else {
-			/* End of btree: */
-			bch2_btree_iter_set_pos(iter, SPOS_MAX);
-			k = bkey_s_c_null;
-			goto out;
-		}
-	}
-out:
-	bch2_btree_iter_verify(iter);
-
-	return k;
-}
-
-/**
- * bch2_btree_iter_peek_upto() - returns first key greater than or equal to
- * iterator's current position
- * @iter:	iterator to peek from
- * @end:	search limit: returns keys less than or equal to @end
- *
- * Returns:	key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
-{
-	struct btree_trans *trans = iter->trans;
-	struct bpos search_key = btree_iter_search_key(iter);
-	struct bkey_s_c k;
-	struct bpos iter_pos;
-	int ret;
-
-	bch2_trans_verify_not_unlocked(trans);
-	EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX));
-
-	if (iter->update_path) {
-		bch2_path_put_nokeep(trans, iter->update_path,
-				     iter->flags & BTREE_ITER_intent);
-		iter->update_path = 0;
-	}
-
-	bch2_btree_iter_verify_entry_exit(iter);
-
-	while (1) {
-		k = __bch2_btree_iter_peek(iter, search_key);
-		if (unlikely(!k.k))
-			goto end;
-		if (unlikely(bkey_err(k)))
-			goto out_no_locked;
-
-		/*
-		 * We need to check against @end before FILTER_SNAPSHOTS because
-		 * if we get to a different inode that requested we might be
-		 * seeing keys for a different snapshot tree that will all be
-		 * filtered out.
-		 *
-		 * But we can't do the full check here, because bkey_start_pos()
-		 * isn't monotonically increasing before FILTER_SNAPSHOTS, and
-		 * that's what we check against in extents mode:
-		 */
-		if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
-			     ? bkey_gt(k.k->p, end)
-			     : k.k->p.inode > end.inode))
-			goto end;
-
-		if (iter->update_path &&
-		    !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
-			bch2_path_put_nokeep(trans, iter->update_path,
-					     iter->flags & BTREE_ITER_intent);
-			iter->update_path = 0;
-		}
-
-		if ((iter->flags & BTREE_ITER_filter_snapshots) &&
-		    (iter->flags & BTREE_ITER_intent) &&
-		    !(iter->flags & BTREE_ITER_is_extents) &&
-		    !iter->update_path) {
-			struct bpos pos = k.k->p;
-
-			if (pos.snapshot < iter->snapshot) {
-				search_key = bpos_successor(k.k->p);
-				continue;
-			}
-
-			pos.snapshot = iter->snapshot;
-
-			/*
-			 * advance, same as on exit for iter->path, but only up
-			 * to snapshot
-			 */
-			__btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
-			iter->update_path = iter->path;
-
-			iter->update_path = bch2_btree_path_set_pos(trans,
-						iter->update_path, pos,
-						iter->flags & BTREE_ITER_intent,
-						_THIS_IP_);
-			ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
-			if (unlikely(ret)) {
-				k = bkey_s_c_err(ret);
-				goto out_no_locked;
-			}
-		}
-
-		/*
-		 * We can never have a key in a leaf node at POS_MAX, so
-		 * we don't have to check these successor() calls:
-		 */
-		if ((iter->flags & BTREE_ITER_filter_snapshots) &&
-		    !bch2_snapshot_is_ancestor(trans->c,
-					       iter->snapshot,
-					       k.k->p.snapshot)) {
-			search_key = bpos_successor(k.k->p);
-			continue;
-		}
-
-		if (bkey_whiteout(k.k) &&
-		    !(iter->flags & BTREE_ITER_all_snapshots)) {
-			search_key = bkey_successor(iter, k.k->p);
-			continue;
-		}
-
-		/*
-		 * iter->pos should be mononotically increasing, and always be
-		 * equal to the key we just returned - except extents can
-		 * straddle iter->pos:
-		 */
-		if (!(iter->flags & BTREE_ITER_is_extents))
-			iter_pos = k.k->p;
-		else
-			iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
-
-		if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
-			     ? bkey_gt(iter_pos, end)
-			     : bkey_ge(iter_pos, end)))
-			goto end;
-
-		break;
-	}
-
-	iter->pos = iter_pos;
-
-	iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
-				iter->flags & BTREE_ITER_intent,
-				btree_iter_ip_allocated(iter));
-
-	btree_path_set_should_be_locked(btree_iter_path(trans, iter));
-out_no_locked:
-	if (iter->update_path) {
-		ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_);
-		if (unlikely(ret))
-			k = bkey_s_c_err(ret);
-		else
-			btree_path_set_should_be_locked(trans->paths + iter->update_path);
-	}
-
-	if (!(iter->flags & BTREE_ITER_all_snapshots))
-		iter->pos.snapshot = iter->snapshot;
-
-	ret = bch2_btree_iter_verify_ret(iter, k);
-	if (unlikely(ret)) {
-		bch2_btree_iter_set_pos(iter, iter->pos);
-		k = bkey_s_c_err(ret);
-	}
-
-	bch2_btree_iter_verify_entry_exit(iter);
-
-	return k;
-end:
-	bch2_btree_iter_set_pos(iter, end);
-	k = bkey_s_c_null;
-	goto out_no_locked;
-}
-
-/**
- * bch2_btree_iter_next() - returns first key greater than iterator's current
- * position
- * @iter:	iterator to peek from
- *
- * Returns:	key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
-{
-	if (!bch2_btree_iter_advance(iter))
-		return bkey_s_c_null;
-
-	return bch2_btree_iter_peek(iter);
-}
-
-/**
- * bch2_btree_iter_peek_prev() - returns first key less than or equal to
- * iterator's current position
- * @iter:	iterator to peek from
- *
- * Returns:	key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
-{
-	struct btree_trans *trans = iter->trans;
-	struct bpos search_key = iter->pos;
-	struct bkey_s_c k;
-	struct bkey saved_k;
-	const struct bch_val *saved_v;
-	btree_path_idx_t saved_path = 0;
-	int ret;
-
-	bch2_trans_verify_not_unlocked(trans);
-	EBUG_ON(btree_iter_path(trans, iter)->cached ||
-		btree_iter_path(trans, iter)->level);
-
-	if (iter->flags & BTREE_ITER_with_journal)
-		return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported);
-
-	bch2_btree_iter_verify(iter);
-	bch2_btree_iter_verify_entry_exit(iter);
-
-	if (iter->flags & BTREE_ITER_filter_snapshots)
-		search_key.snapshot = U32_MAX;
-
-	while (1) {
-		iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
-						iter->flags & BTREE_ITER_intent,
-						btree_iter_ip_allocated(iter));
-
-		ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
-		if (unlikely(ret)) {
-			/* ensure that iter->k is consistent with iter->pos: */
-			bch2_btree_iter_set_pos(iter, iter->pos);
-			k = bkey_s_c_err(ret);
-			goto out_no_locked;
-		}
-
-		struct btree_path *path = btree_iter_path(trans, iter);
-
-		k = btree_path_level_peek(trans, path, &path->l[0], &iter->k);
-		if (!k.k ||
-		    ((iter->flags & BTREE_ITER_is_extents)
-		     ? bpos_ge(bkey_start_pos(k.k), search_key)
-		     : bpos_gt(k.k->p, search_key)))
-			k = btree_path_level_prev(trans, path, &path->l[0], &iter->k);
-
-		if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
-			     trans->nr_updates))
-			bch2_btree_trans_peek_prev_updates(trans, iter, &k);
-
-		if (likely(k.k)) {
-			if (iter->flags & BTREE_ITER_filter_snapshots) {
-				if (k.k->p.snapshot == iter->snapshot)
-					goto got_key;
-
-				/*
-				 * If we have a saved candidate, and we're no
-				 * longer at the same _key_ (not pos), return
-				 * that candidate
-				 */
-				if (saved_path && !bkey_eq(k.k->p, saved_k.p)) {
-					bch2_path_put_nokeep(trans, iter->path,
-						      iter->flags & BTREE_ITER_intent);
-					iter->path = saved_path;
-					saved_path = 0;
-					iter->k	= saved_k;
-					k.v	= saved_v;
-					goto got_key;
-				}
-
-				if (bch2_snapshot_is_ancestor(trans->c,
-							      iter->snapshot,
-							      k.k->p.snapshot)) {
-					if (saved_path)
-						bch2_path_put_nokeep(trans, saved_path,
-						      iter->flags & BTREE_ITER_intent);
-					saved_path = btree_path_clone(trans, iter->path,
-								iter->flags & BTREE_ITER_intent,
-								_THIS_IP_);
-					path = btree_iter_path(trans, iter);
-					saved_k = *k.k;
-					saved_v = k.v;
-				}
-
-				search_key = bpos_predecessor(k.k->p);
-				continue;
-			}
-got_key:
-			if (bkey_whiteout(k.k) &&
-			    !(iter->flags & BTREE_ITER_all_snapshots)) {
-				search_key = bkey_predecessor(iter, k.k->p);
-				if (iter->flags & BTREE_ITER_filter_snapshots)
-					search_key.snapshot = U32_MAX;
-				continue;
-			}
-
-			btree_path_set_should_be_locked(path);
-			break;
-		} else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
-			/* Advance to previous leaf node: */
-			search_key = bpos_predecessor(path->l[0].b->data->min_key);
-		} else {
-			/* Start of btree: */
-			bch2_btree_iter_set_pos(iter, POS_MIN);
-			k = bkey_s_c_null;
-			goto out_no_locked;
-		}
-	}
-
-	EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos));
-
-	/* Extents can straddle iter->pos: */
-	if (bkey_lt(k.k->p, iter->pos))
-		iter->pos = k.k->p;
-
-	if (iter->flags & BTREE_ITER_filter_snapshots)
-		iter->pos.snapshot = iter->snapshot;
-out_no_locked:
-	if (saved_path)
-		bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent);
-
-	bch2_btree_iter_verify_entry_exit(iter);
-	bch2_btree_iter_verify(iter);
-
-	return k;
-}
-
-/**
- * bch2_btree_iter_prev() - returns first key less than iterator's current
- * position
- * @iter:	iterator to peek from
- *
- * Returns:	key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
-{
-	if (!bch2_btree_iter_rewind(iter))
-		return bkey_s_c_null;
-
-	return bch2_btree_iter_peek_prev(iter);
-}
-
-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
-{
-	struct btree_trans *trans = iter->trans;
-	struct bpos search_key;
-	struct bkey_s_c k;
-	int ret;
-
-	bch2_trans_verify_not_unlocked(trans);
-	bch2_btree_iter_verify(iter);
-	bch2_btree_iter_verify_entry_exit(iter);
-	EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache));
-
-	/* extents can't span inode numbers: */
-	if ((iter->flags & BTREE_ITER_is_extents) &&
-	    unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
-		if (iter->pos.inode == KEY_INODE_MAX)
-			return bkey_s_c_null;
-
-		bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
-	}
-
-	search_key = btree_iter_search_key(iter);
-	iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
-					iter->flags & BTREE_ITER_intent,
-					btree_iter_ip_allocated(iter));
-
-	ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
-	if (unlikely(ret)) {
-		k = bkey_s_c_err(ret);
-		goto out_no_locked;
-	}
-
-	if ((iter->flags & BTREE_ITER_cached) ||
-	    !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) {
-		k = bkey_s_c_null;
-
-		if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
-			     trans->nr_updates)) {
-			bch2_btree_trans_peek_slot_updates(trans, iter, &k);
-			if (k.k)
-				goto out;
-		}
-
-		if (unlikely(iter->flags & BTREE_ITER_with_journal) &&
-		    (k = btree_trans_peek_slot_journal(trans, iter)).k)
-			goto out;
-
-		if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
-		    (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
-			if (!bkey_err(k))
-				iter->k = *k.k;
-			/* We're not returning a key from iter->path: */
-			goto out_no_locked;
-		}
-
-		k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k);
-		if (unlikely(!k.k))
-			goto out_no_locked;
-	} else {
-		struct bpos next;
-		struct bpos end = iter->pos;
-
-		if (iter->flags & BTREE_ITER_is_extents)
-			end.offset = U64_MAX;
-
-		EBUG_ON(btree_iter_path(trans, iter)->level);
-
-		if (iter->flags & BTREE_ITER_intent) {
-			struct btree_iter iter2;
-
-			bch2_trans_copy_iter(&iter2, iter);
-			k = bch2_btree_iter_peek_upto(&iter2, end);
-
-			if (k.k && !bkey_err(k)) {
-				swap(iter->key_cache_path, iter2.key_cache_path);
-				iter->k = iter2.k;
-				k.k = &iter->k;
-			}
-			bch2_trans_iter_exit(trans, &iter2);
-		} else {
-			struct bpos pos = iter->pos;
-
-			k = bch2_btree_iter_peek_upto(iter, end);
-			if (unlikely(bkey_err(k)))
-				bch2_btree_iter_set_pos(iter, pos);
-			else
-				iter->pos = pos;
-		}
-
-		if (unlikely(bkey_err(k)))
-			goto out_no_locked;
-
-		next = k.k ? bkey_start_pos(k.k) : POS_MAX;
-
-		if (bkey_lt(iter->pos, next)) {
-			bkey_init(&iter->k);
-			iter->k.p = iter->pos;
-
-			if (iter->flags & BTREE_ITER_is_extents) {
-				bch2_key_resize(&iter->k,
-						min_t(u64, KEY_SIZE_MAX,
-						      (next.inode == iter->pos.inode
-						       ? next.offset
-						       : KEY_OFFSET_MAX) -
-						      iter->pos.offset));
-				EBUG_ON(!iter->k.size);
-			}
-
-			k = (struct bkey_s_c) { &iter->k, NULL };
-		}
-	}
-out:
-	btree_path_set_should_be_locked(btree_iter_path(trans, iter));
-out_no_locked:
-	bch2_btree_iter_verify_entry_exit(iter);
-	bch2_btree_iter_verify(iter);
-	ret = bch2_btree_iter_verify_ret(iter, k);
-	if (unlikely(ret))
-		return bkey_s_c_err(ret);
-
-	return k;
-}
-
-struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
-{
-	if (!bch2_btree_iter_advance(iter))
-		return bkey_s_c_null;
-
-	return bch2_btree_iter_peek_slot(iter);
-}
-
-struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
-{
-	if (!bch2_btree_iter_rewind(iter))
-		return bkey_s_c_null;
-
-	return bch2_btree_iter_peek_slot(iter);
-}
-
-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter)
-{
-	struct bkey_s_c k;
-
-	while (btree_trans_too_many_iters(iter->trans) ||
-	       (k = bch2_btree_iter_peek_type(iter, iter->flags),
-		bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
-		bch2_trans_begin(iter->trans);
-
-	return k;
-}
-
-/* new transactional stuff: */
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
-{
-	struct btree_path *path;
-	unsigned i;
-
-	BUG_ON(trans->nr_sorted != bitmap_weight(trans->paths_allocated, trans->nr_paths) - 1);
-
-	trans_for_each_path(trans, path, i) {
-		BUG_ON(path->sorted_idx >= trans->nr_sorted);
-		BUG_ON(trans->sorted[path->sorted_idx] != i);
-	}
-
-	for (i = 0; i < trans->nr_sorted; i++) {
-		unsigned idx = trans->sorted[i];
-
-		BUG_ON(!test_bit(idx, trans->paths_allocated));
-		BUG_ON(trans->paths[idx].sorted_idx != i);
-	}
-}
-
-static void btree_trans_verify_sorted(struct btree_trans *trans)
-{
-	struct btree_path *path, *prev = NULL;
-	struct trans_for_each_path_inorder_iter iter;
-
-	if (!bch2_debug_check_iterators)
-		return;
-
-	trans_for_each_path_inorder(trans, path, iter) {
-		if (prev && btree_path_cmp(prev, path) > 0) {
-			__bch2_dump_trans_paths_updates(trans, true);
-			panic("trans paths out of order!\n");
-		}
-		prev = path;
-	}
-}
-#else
-static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {}
-static inline void btree_trans_verify_sorted(struct btree_trans *trans) {}
-#endif
-
-void __bch2_btree_trans_sort_paths(struct btree_trans *trans)
-{
-	int i, l = 0, r = trans->nr_sorted, inc = 1;
-	bool swapped;
-
-	btree_trans_verify_sorted_refs(trans);
-
-	if (trans->paths_sorted)
-		goto out;
-
-	/*
-	 * Cocktail shaker sort: this is efficient because iterators will be
-	 * mostly sorted.
-	 */
-	do {
-		swapped = false;
-
-		for (i = inc > 0 ? l : r - 2;
-		     i + 1 < r && i >= l;
-		     i += inc) {
-			if (btree_path_cmp(trans->paths + trans->sorted[i],
-					   trans->paths + trans->sorted[i + 1]) > 0) {
-				swap(trans->sorted[i], trans->sorted[i + 1]);
-				trans->paths[trans->sorted[i]].sorted_idx = i;
-				trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1;
-				swapped = true;
-			}
-		}
-
-		if (inc > 0)
-			--r;
-		else
-			l++;
-		inc = -inc;
-	} while (swapped);
-
-	trans->paths_sorted = true;
-out:
-	btree_trans_verify_sorted(trans);
-}
-
-static inline void btree_path_list_remove(struct btree_trans *trans,
-					  struct btree_path *path)
-{
-	EBUG_ON(path->sorted_idx >= trans->nr_sorted);
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
-	trans->nr_sorted--;
-	memmove_u64s_down_small(trans->sorted + path->sorted_idx,
-				trans->sorted + path->sorted_idx + 1,
-				DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
-					     sizeof(u64) / sizeof(btree_path_idx_t)));
-#else
-	array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
-#endif
-	for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
-		trans->paths[trans->sorted[i]].sorted_idx = i;
-}
-
-static inline void btree_path_list_add(struct btree_trans *trans,
-				       btree_path_idx_t pos,
-				       btree_path_idx_t path_idx)
-{
-	struct btree_path *path = trans->paths + path_idx;
-
-	path->sorted_idx = pos ? trans->paths[pos].sorted_idx + 1 : trans->nr_sorted;
-
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
-	memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1,
-			      trans->sorted + path->sorted_idx,
-			      DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
-					   sizeof(u64) / sizeof(btree_path_idx_t)));
-	trans->nr_sorted++;
-	trans->sorted[path->sorted_idx] = path_idx;
-#else
-	array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path_idx);
-#endif
-
-	for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
-		trans->paths[trans->sorted[i]].sorted_idx = i;
-
-	btree_trans_verify_sorted_refs(trans);
-}
-
-void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
-{
-	if (iter->update_path)
-		bch2_path_put_nokeep(trans, iter->update_path,
-			      iter->flags & BTREE_ITER_intent);
-	if (iter->path)
-		bch2_path_put(trans, iter->path,
-			      iter->flags & BTREE_ITER_intent);
-	if (iter->key_cache_path)
-		bch2_path_put(trans, iter->key_cache_path,
-			      iter->flags & BTREE_ITER_intent);
-	iter->path		= 0;
-	iter->update_path	= 0;
-	iter->key_cache_path	= 0;
-	iter->trans		= NULL;
-}
-
-void bch2_trans_iter_init_outlined(struct btree_trans *trans,
-			  struct btree_iter *iter,
-			  enum btree_id btree_id, struct bpos pos,
-			  unsigned flags)
-{
-	bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
-			       bch2_btree_iter_flags(trans, btree_id, flags),
-			       _RET_IP_);
-}
-
-void bch2_trans_node_iter_init(struct btree_trans *trans,
-			       struct btree_iter *iter,
-			       enum btree_id btree_id,
-			       struct bpos pos,
-			       unsigned locks_want,
-			       unsigned depth,
-			       unsigned flags)
-{
-	flags |= BTREE_ITER_not_extents;
-	flags |= BTREE_ITER_snapshot_field;
-	flags |= BTREE_ITER_all_snapshots;
-
-	bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
-			       __bch2_btree_iter_flags(trans, btree_id, flags),
-			       _RET_IP_);
-
-	iter->min_depth	= depth;
-
-	struct btree_path *path = btree_iter_path(trans, iter);
-	BUG_ON(path->locks_want	 < min(locks_want, BTREE_MAX_DEPTH));
-	BUG_ON(path->level	!= depth);
-	BUG_ON(iter->min_depth	!= depth);
-}
-
-void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
-{
-	struct btree_trans *trans = src->trans;
-
-	*dst = *src;
-#ifdef TRACK_PATH_ALLOCATED
-	dst->ip_allocated = _RET_IP_;
-#endif
-	if (src->path)
-		__btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_intent);
-	if (src->update_path)
-		__btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_intent);
-	dst->key_cache_path = 0;
-}
-
-void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
-{
-	struct bch_fs *c = trans->c;
-	unsigned new_top = trans->mem_top + size;
-	unsigned old_bytes = trans->mem_bytes;
-	unsigned new_bytes = roundup_pow_of_two(new_top);
-	int ret;
-	void *new_mem;
-	void *p;
-
-	WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
-
-	struct btree_transaction_stats *s = btree_trans_stats(trans);
-	s->max_mem = max(s->max_mem, new_bytes);
-
-	if (trans->used_mempool) {
-		if (trans->mem_bytes >= new_bytes)
-			goto out_change_top;
-
-		/* No more space from mempool item, need malloc new one */
-		new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN);
-		if (unlikely(!new_mem)) {
-			bch2_trans_unlock(trans);
-
-			new_mem = kmalloc(new_bytes, GFP_KERNEL);
-			if (!new_mem)
-				return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
-
-			ret = bch2_trans_relock(trans);
-			if (ret) {
-				kfree(new_mem);
-				return ERR_PTR(ret);
-			}
-		}
-		memcpy(new_mem, trans->mem, trans->mem_top);
-		trans->used_mempool = false;
-		mempool_free(trans->mem, &c->btree_trans_mem_pool);
-		goto out_new_mem;
-	}
-
-	new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
-	if (unlikely(!new_mem)) {
-		bch2_trans_unlock(trans);
-
-		new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
-		if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
-			new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
-			new_bytes = BTREE_TRANS_MEM_MAX;
-			memcpy(new_mem, trans->mem, trans->mem_top);
-			trans->used_mempool = true;
-			kfree(trans->mem);
-		}
-
-		if (!new_mem)
-			return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
-
-		trans->mem = new_mem;
-		trans->mem_bytes = new_bytes;
-
-		ret = bch2_trans_relock(trans);
-		if (ret)
-			return ERR_PTR(ret);
-	}
-out_new_mem:
-	trans->mem = new_mem;
-	trans->mem_bytes = new_bytes;
-
-	if (old_bytes) {
-		trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
-		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
-	}
-out_change_top:
-	p = trans->mem + trans->mem_top;
-	trans->mem_top += size;
-	memset(p, 0, size);
-	return p;
-}
-
-static inline void check_srcu_held_too_long(struct btree_trans *trans)
-{
-	WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10),
-	     "btree trans held srcu lock (delaying memory reclaim) for %lu seconds",
-	     (jiffies - trans->srcu_lock_time) / HZ);
-}
-
-void bch2_trans_srcu_unlock(struct btree_trans *trans)
-{
-	if (trans->srcu_held) {
-		struct bch_fs *c = trans->c;
-		struct btree_path *path;
-		unsigned i;
-
-		trans_for_each_path(trans, path, i)
-			if (path->cached && !btree_node_locked(path, 0))
-				path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
-
-		check_srcu_held_too_long(trans);
-		srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
-		trans->srcu_held = false;
-	}
-}
-
-static void bch2_trans_srcu_lock(struct btree_trans *trans)
-{
-	if (!trans->srcu_held) {
-		trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier);
-		trans->srcu_lock_time	= jiffies;
-		trans->srcu_held = true;
-	}
-}
-
-/**
- * bch2_trans_begin() - reset a transaction after a interrupted attempt
- * @trans: transaction to reset
- *
- * Returns:	current restart counter, to be used with trans_was_restarted()
- *
- * While iterating over nodes or updating nodes a attempt to lock a btree node
- * may return BCH_ERR_transaction_restart when the trylock fails. When this
- * occurs bch2_trans_begin() should be called and the transaction retried.
- */
-u32 bch2_trans_begin(struct btree_trans *trans)
-{
-	struct btree_path *path;
-	unsigned i;
-	u64 now;
-
-	bch2_trans_reset_updates(trans);
-
-	trans->restart_count++;
-	trans->mem_top			= 0;
-	trans->journal_entries		= NULL;
-
-	trans_for_each_path(trans, path, i) {
-		path->should_be_locked = false;
-
-		/*
-		 * If the transaction wasn't restarted, we're presuming to be
-		 * doing something new: dont keep iterators excpt the ones that
-		 * are in use - except for the subvolumes btree:
-		 */
-		if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
-			path->preserve = false;
-
-		/*
-		 * XXX: we probably shouldn't be doing this if the transaction
-		 * was restarted, but currently we still overflow transaction
-		 * iterators if we do that
-		 */
-		if (!path->ref && !path->preserve)
-			__bch2_path_free(trans, i);
-		else
-			path->preserve = false;
-	}
-
-	now = local_clock();
-
-	if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) &&
-	    time_after64(now, trans->last_begin_time + 10))
-		__bch2_time_stats_update(&btree_trans_stats(trans)->duration,
-					 trans->last_begin_time, now);
-
-	if (!trans->restarted &&
-	    (need_resched() ||
-	     time_after64(now, trans->last_begin_time + BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS))) {
-		bch2_trans_unlock(trans);
-		cond_resched();
-		now = local_clock();
-	}
-	trans->last_begin_time = now;
-
-	if (unlikely(trans->srcu_held &&
-		     time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
-		bch2_trans_srcu_unlock(trans);
-
-	trans->last_begin_ip = _RET_IP_;
-	trans->locked  = true;
-
-	if (trans->restarted) {
-		bch2_btree_path_traverse_all(trans);
-		trans->notrace_relock_fail = false;
-	}
-
-	bch2_trans_verify_not_unlocked(trans);
-	return trans->restart_count;
-}
-
-const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR] = { "(unknown)" };
-
-unsigned bch2_trans_get_fn_idx(const char *fn)
-{
-	for (unsigned i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
-		if (!bch2_btree_transaction_fns[i] ||
-		    bch2_btree_transaction_fns[i] == fn) {
-			bch2_btree_transaction_fns[i] = fn;
-			return i;
-		}
-
-	pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
-	return 0;
-}
-
-struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
-	__acquires(&c->btree_trans_barrier)
-{
-	struct btree_trans *trans;
-
-	if (IS_ENABLED(__KERNEL__)) {
-		trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
-		if (trans) {
-			memset(trans, 0, offsetof(struct btree_trans, list));
-			goto got_trans;
-		}
-	}
-
-	trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
-	memset(trans, 0, sizeof(*trans));
-	closure_init_stack(&trans->ref);
-
-	seqmutex_lock(&c->btree_trans_lock);
-	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
-		struct btree_trans *pos;
-		pid_t pid = current->pid;
-
-		trans->locking_wait.task = current;
-
-		list_for_each_entry(pos, &c->btree_trans_list, list) {
-			struct task_struct *pos_task = READ_ONCE(pos->locking_wait.task);
-			/*
-			 * We'd much prefer to be stricter here and completely
-			 * disallow multiple btree_trans in the same thread -
-			 * but the data move path calls bch2_write when we
-			 * already have a btree_trans initialized.
-			 */
-			BUG_ON(pos_task &&
-			       pid == pos_task->pid &&
-			       pos->locked);
-
-			if (pos_task && pid < pos_task->pid) {
-				list_add_tail(&trans->list, &pos->list);
-				goto list_add_done;
-			}
-		}
-	}
-	list_add_tail(&trans->list, &c->btree_trans_list);
-list_add_done:
-	seqmutex_unlock(&c->btree_trans_lock);
-got_trans:
-	trans->c		= c;
-	trans->last_begin_time	= local_clock();
-	trans->fn_idx		= fn_idx;
-	trans->locking_wait.task = current;
-	trans->locked		= true;
-	trans->journal_replay_not_finished =
-		unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)) &&
-		atomic_inc_not_zero(&c->journal_keys.ref);
-	trans->nr_paths		= ARRAY_SIZE(trans->_paths);
-	trans->paths_allocated	= trans->_paths_allocated;
-	trans->sorted		= trans->_sorted;
-	trans->paths		= trans->_paths;
-	trans->updates		= trans->_updates;
-
-	*trans_paths_nr(trans->paths) = BTREE_ITER_INITIAL;
-
-	trans->paths_allocated[0] = 1;
-
-	if (fn_idx < BCH_TRANSACTIONS_NR) {
-		trans->fn = bch2_btree_transaction_fns[fn_idx];
-
-		struct btree_transaction_stats *s = &c->btree_transaction_stats[fn_idx];
-
-		if (s->max_mem) {
-			unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
-
-			trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
-			if (likely(trans->mem))
-				trans->mem_bytes = expected_mem_bytes;
-		}
-
-		trans->nr_paths_max = s->nr_max_paths;
-		trans->journal_entries_size = s->journal_entries_size;
-	}
-
-	trans->srcu_idx		= srcu_read_lock(&c->btree_trans_barrier);
-	trans->srcu_lock_time	= jiffies;
-	trans->srcu_held	= true;
-	return trans;
-}
-
-static void check_btree_paths_leaked(struct btree_trans *trans)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
-	struct bch_fs *c = trans->c;
-	struct btree_path *path;
-	unsigned i;
-
-	trans_for_each_path(trans, path, i)
-		if (path->ref)
-			goto leaked;
-	return;
-leaked:
-	bch_err(c, "btree paths leaked from %s!", trans->fn);
-	trans_for_each_path(trans, path, i)
-		if (path->ref)
-			printk(KERN_ERR "  btree %s %pS\n",
-			       bch2_btree_id_str(path->btree_id),
-			       (void *) path->ip_allocated);
-	/* Be noisy about this: */
-	bch2_fatal_error(c);
-#endif
-}
-
-void bch2_trans_put(struct btree_trans *trans)
-	__releases(&c->btree_trans_barrier)
-{
-	struct bch_fs *c = trans->c;
-
-	bch2_trans_unlock(trans);
-
-	trans_for_each_update(trans, i)
-		__btree_path_put(trans->paths + i->path, true);
-	trans->nr_updates	= 0;
-	trans->locking_wait.task = NULL;
-
-	check_btree_paths_leaked(trans);
-
-	if (trans->srcu_held) {
-		check_srcu_held_too_long(trans);
-		srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
-	}
-
-	if (trans->fs_usage_deltas) {
-		if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
-		    REPLICAS_DELTA_LIST_MAX)
-			mempool_free(trans->fs_usage_deltas,
-				     &c->replicas_delta_pool);
-		else
-			kfree(trans->fs_usage_deltas);
-	}
-
-	if (unlikely(trans->journal_replay_not_finished))
-		bch2_journal_keys_put(c);
-
-	unsigned long *paths_allocated = trans->paths_allocated;
-	trans->paths_allocated	= NULL;
-	trans->paths		= NULL;
-
-	if (paths_allocated != trans->_paths_allocated)
-		kvfree_rcu_mightsleep(paths_allocated);
-
-	if (trans->used_mempool)
-		mempool_free(trans->mem, &c->btree_trans_mem_pool);
-	else
-		kfree(trans->mem);
-
-	/* Userspace doesn't have a real percpu implementation: */
-	if (IS_ENABLED(__KERNEL__))
-		trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
-
-	if (trans) {
-		closure_sync(&trans->ref);
-
-		seqmutex_lock(&c->btree_trans_lock);
-		list_del(&trans->list);
-		seqmutex_unlock(&c->btree_trans_lock);
-
-		mempool_free(trans, &c->btree_trans_pool);
-	}
-}
-
-static void __maybe_unused
-bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
-				      struct btree_bkey_cached_common *b)
-{
-	struct six_lock_count c = six_lock_counts(&b->lock);
-	struct task_struct *owner;
-	pid_t pid;
-
-	rcu_read_lock();
-	owner = READ_ONCE(b->lock.owner);
-	pid = owner ? owner->pid : 0;
-	rcu_read_unlock();
-
-	prt_printf(out, "\t%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
-		   b->level, bch2_btree_id_str(b->btree_id));
-	bch2_bpos_to_text(out, btree_node_pos(b));
-
-	prt_printf(out, "\t locks %u:%u:%u held by pid %u",
-		   c.n[0], c.n[1], c.n[2], pid);
-}
-
-void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
-{
-	struct btree_bkey_cached_common *b;
-	static char lock_types[] = { 'r', 'i', 'w' };
-	struct task_struct *task = READ_ONCE(trans->locking_wait.task);
-	unsigned l, idx;
-
-	/* before rcu_read_lock(): */
-	bch2_printbuf_make_room(out, 4096);
-
-	if (!out->nr_tabstops) {
-		printbuf_tabstop_push(out, 16);
-		printbuf_tabstop_push(out, 32);
-	}
-
-	prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn);
-
-	/* trans->paths is rcu protected vs. freeing */
-	rcu_read_lock();
-	out->atomic++;
-
-	struct btree_path *paths = rcu_dereference(trans->paths);
-	if (!paths)
-		goto out;
-
-	unsigned long *paths_allocated = trans_paths_allocated(paths);
-
-	trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), idx, 1) {
-		struct btree_path *path = paths + idx;
-		if (!path->nodes_locked)
-			continue;
-
-		prt_printf(out, "  path %u %c l=%u %s:",
-		       idx,
-		       path->cached ? 'c' : 'b',
-		       path->level,
-		       bch2_btree_id_str(path->btree_id));
-		bch2_bpos_to_text(out, path->pos);
-		prt_newline(out);
-
-		for (l = 0; l < BTREE_MAX_DEPTH; l++) {
-			if (btree_node_locked(path, l) &&
-			    !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
-				prt_printf(out, "    %c l=%u ",
-					   lock_types[btree_node_locked_type(path, l)], l);
-				bch2_btree_bkey_cached_common_to_text(out, b);
-				prt_newline(out);
-			}
-		}
-	}
-
-	b = READ_ONCE(trans->locking);
-	if (b) {
-		prt_printf(out, "  blocked for %lluus on\n",
-			   div_u64(local_clock() - trans->locking_wait.start_time, 1000));
-		prt_printf(out, "    %c", lock_types[trans->locking_wait.lock_want]);
-		bch2_btree_bkey_cached_common_to_text(out, b);
-		prt_newline(out);
-	}
-out:
-	--out->atomic;
-	rcu_read_unlock();
-}
-
-void bch2_fs_btree_iter_exit(struct bch_fs *c)
-{
-	struct btree_transaction_stats *s;
-	struct btree_trans *trans;
-	int cpu;
-
-	if (c->btree_trans_bufs)
-		for_each_possible_cpu(cpu) {
-			struct btree_trans *trans =
-				per_cpu_ptr(c->btree_trans_bufs, cpu)->trans;
-
-			if (trans) {
-				closure_sync(&trans->ref);
-
-				seqmutex_lock(&c->btree_trans_lock);
-				list_del(&trans->list);
-				seqmutex_unlock(&c->btree_trans_lock);
-			}
-			kfree(trans);
-		}
-	free_percpu(c->btree_trans_bufs);
-
-	trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
-	if (trans)
-		panic("%s leaked btree_trans\n", trans->fn);
-
-	for (s = c->btree_transaction_stats;
-	     s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
-	     s++) {
-		kfree(s->max_paths_text);
-		bch2_time_stats_exit(&s->lock_hold_times);
-	}
-
-	if (c->btree_trans_barrier_initialized)
-		cleanup_srcu_struct(&c->btree_trans_barrier);
-	mempool_exit(&c->btree_trans_mem_pool);
-	mempool_exit(&c->btree_trans_pool);
-}
-
-void bch2_fs_btree_iter_init_early(struct bch_fs *c)
-{
-	struct btree_transaction_stats *s;
-
-	for (s = c->btree_transaction_stats;
-	     s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
-	     s++) {
-		bch2_time_stats_init(&s->duration);
-		bch2_time_stats_init(&s->lock_hold_times);
-		mutex_init(&s->lock);
-	}
-
-	INIT_LIST_HEAD(&c->btree_trans_list);
-	seqmutex_init(&c->btree_trans_lock);
-}
-
-int bch2_fs_btree_iter_init(struct bch_fs *c)
-{
-	int ret;
-
-	c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf);
-	if (!c->btree_trans_bufs)
-		return -ENOMEM;
-
-	ret   = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1,
-					  sizeof(struct btree_trans)) ?:
-		mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
-					  BTREE_TRANS_MEM_MAX) ?:
-		init_srcu_struct(&c->btree_trans_barrier);
-	if (!ret)
-		c->btree_trans_barrier_initialized = true;
-	return ret;
-}
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
deleted file mode 100644
index eab2a25bdc7a..000000000000
--- a/fs/bcachefs/btree_iter.h
+++ /dev/null
@@ -1,896 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_ITER_H
-#define _BCACHEFS_BTREE_ITER_H
-
-#include "bset.h"
-#include "btree_types.h"
-#include "trace.h"
-
-static inline int __bkey_err(const struct bkey *k)
-{
-	return PTR_ERR_OR_ZERO(k);
-}
-
-#define bkey_err(_k)	__bkey_err((_k).k)
-
-static inline void __btree_path_get(struct btree_path *path, bool intent)
-{
-	path->ref++;
-	path->intent_ref += intent;
-}
-
-static inline bool __btree_path_put(struct btree_path *path, bool intent)
-{
-	EBUG_ON(!path->ref);
-	EBUG_ON(!path->intent_ref && intent);
-	path->intent_ref -= intent;
-	return --path->ref == 0;
-}
-
-static inline void btree_path_set_dirty(struct btree_path *path,
-					enum btree_path_uptodate u)
-{
-	path->uptodate = max_t(unsigned, path->uptodate, u);
-}
-
-static inline struct btree *btree_path_node(struct btree_path *path,
-					    unsigned level)
-{
-	return level < BTREE_MAX_DEPTH ? path->l[level].b : NULL;
-}
-
-static inline bool btree_node_lock_seq_matches(const struct btree_path *path,
-					const struct btree *b, unsigned level)
-{
-	return path->l[level].lock_seq == six_lock_seq(&b->c.lock);
-}
-
-static inline struct btree *btree_node_parent(struct btree_path *path,
-					      struct btree *b)
-{
-	return btree_path_node(path, b->c.level + 1);
-}
-
-/* Iterate over paths within a transaction: */
-
-void __bch2_btree_trans_sort_paths(struct btree_trans *);
-
-static inline void btree_trans_sort_paths(struct btree_trans *trans)
-{
-	if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
-	    trans->paths_sorted)
-		return;
-	__bch2_btree_trans_sort_paths(trans);
-}
-
-static inline unsigned long *trans_paths_nr(struct btree_path *paths)
-{
-	return &container_of(paths, struct btree_trans_paths, paths[0])->nr_paths;
-}
-
-static inline unsigned long *trans_paths_allocated(struct btree_path *paths)
-{
-	unsigned long *v = trans_paths_nr(paths);
-	return v - BITS_TO_LONGS(*v);
-}
-
-#define trans_for_each_path_idx_from(_paths_allocated, _nr, _idx, _start)\
-	for (_idx = _start;						\
-	     (_idx = find_next_bit(_paths_allocated, _nr, _idx)) < _nr;	\
-	     _idx++)
-
-static inline struct btree_path *
-__trans_next_path(struct btree_trans *trans, unsigned *idx)
-{
-	unsigned long *w = trans->paths_allocated + *idx / BITS_PER_LONG;
-	/*
-	 * Open coded find_next_bit(), because
-	 *  - this is fast path, we can't afford the function call
-	 *  - and we know that nr_paths is a multiple of BITS_PER_LONG,
-	 */
-	while (*idx < trans->nr_paths) {
-		unsigned long v = *w >> (*idx & (BITS_PER_LONG - 1));
-		if (v) {
-			*idx += __ffs(v);
-			return trans->paths + *idx;
-		}
-
-		*idx += BITS_PER_LONG;
-		*idx &= ~(BITS_PER_LONG - 1);
-		w++;
-	}
-
-	return NULL;
-}
-
-/*
- * This version is intended to be safe for use on a btree_trans that is owned by
- * another thread, for bch2_btree_trans_to_text();
- */
-#define trans_for_each_path_from(_trans, _path, _idx, _start)		\
-	for (_idx = _start;						\
-	     (_path = __trans_next_path((_trans), &_idx));		\
-	     _idx++)
-
-#define trans_for_each_path(_trans, _path, _idx)			\
-	trans_for_each_path_from(_trans, _path, _idx, 1)
-
-static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path)
-{
-	unsigned idx = path ? path->sorted_idx + 1 : 0;
-
-	EBUG_ON(idx > trans->nr_sorted);
-
-	return idx < trans->nr_sorted
-		? trans->paths + trans->sorted[idx]
-		: NULL;
-}
-
-static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path)
-{
-	unsigned idx = path ? path->sorted_idx : trans->nr_sorted;
-
-	return idx
-		? trans->paths + trans->sorted[idx - 1]
-		: NULL;
-}
-
-#define trans_for_each_path_idx_inorder(_trans, _iter)			\
-	for (_iter = (struct trans_for_each_path_inorder_iter) { 0 };	\
-	     (_iter.path_idx = trans->sorted[_iter.sorted_idx],		\
-	      _iter.sorted_idx < (_trans)->nr_sorted);			\
-	     _iter.sorted_idx++)
-
-struct trans_for_each_path_inorder_iter {
-	btree_path_idx_t	sorted_idx;
-	btree_path_idx_t	path_idx;
-};
-
-#define trans_for_each_path_inorder(_trans, _path, _iter)		\
-	for (_iter = (struct trans_for_each_path_inorder_iter) { 0 };	\
-	     (_iter.path_idx = trans->sorted[_iter.sorted_idx],		\
-	      _path = (_trans)->paths + _iter.path_idx,			\
-	      _iter.sorted_idx < (_trans)->nr_sorted);			\
-	     _iter.sorted_idx++)
-
-#define trans_for_each_path_inorder_reverse(_trans, _path, _i)		\
-	for (_i = trans->nr_sorted - 1;					\
-	     ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) >= 0;\
-	     --_i)
-
-static inline bool __path_has_node(const struct btree_path *path,
-				   const struct btree *b)
-{
-	return path->l[b->c.level].b == b &&
-		btree_node_lock_seq_matches(path, b, b->c.level);
-}
-
-static inline struct btree_path *
-__trans_next_path_with_node(struct btree_trans *trans, struct btree *b,
-			    unsigned *idx)
-{
-	struct btree_path *path;
-
-	while ((path = __trans_next_path(trans, idx)) &&
-		!__path_has_node(path, b))
-	       (*idx)++;
-
-	return path;
-}
-
-#define trans_for_each_path_with_node(_trans, _b, _path, _iter)		\
-	for (_iter = 1;							\
-	     (_path = __trans_next_path_with_node((_trans), (_b), &_iter));\
-	     _iter++)
-
-btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *, btree_path_idx_t,
-					    bool, unsigned long);
-
-static inline btree_path_idx_t __must_check
-bch2_btree_path_make_mut(struct btree_trans *trans,
-			 btree_path_idx_t path, bool intent,
-			 unsigned long ip)
-{
-	if (trans->paths[path].ref > 1 ||
-	    trans->paths[path].preserve)
-		path = __bch2_btree_path_make_mut(trans, path, intent, ip);
-	trans->paths[path].should_be_locked = false;
-	return path;
-}
-
-btree_path_idx_t __must_check
-__bch2_btree_path_set_pos(struct btree_trans *, btree_path_idx_t,
-			  struct bpos, bool, unsigned long);
-
-static inline btree_path_idx_t __must_check
-bch2_btree_path_set_pos(struct btree_trans *trans,
-			btree_path_idx_t path, struct bpos new_pos,
-			bool intent, unsigned long ip)
-{
-	return !bpos_eq(new_pos, trans->paths[path].pos)
-		? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip)
-		: path;
-}
-
-int __must_check bch2_btree_path_traverse_one(struct btree_trans *,
-					      btree_path_idx_t,
-					      unsigned, unsigned long);
-
-static inline void bch2_trans_verify_not_unlocked(struct btree_trans *);
-
-static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
-					  btree_path_idx_t path, unsigned flags)
-{
-	bch2_trans_verify_not_unlocked(trans);
-
-	if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK)
-		return 0;
-
-	return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_);
-}
-
-btree_path_idx_t bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
-				 unsigned, unsigned, unsigned, unsigned long);
-btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *, enum btree_id,
-					    unsigned, struct bpos);
-
-struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
-
-/*
- * bch2_btree_path_peek_slot() for a cached iterator might return a key in a
- * different snapshot:
- */
-static inline struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u)
-{
-	struct bkey_s_c k = bch2_btree_path_peek_slot(path, u);
-
-	if (k.k && bpos_eq(path->pos, k.k->p))
-		return k;
-
-	bkey_init(u);
-	u->p = path->pos;
-	return (struct bkey_s_c) { u, NULL };
-}
-
-struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *,
-					struct btree_iter *, struct bpos);
-
-void bch2_btree_path_level_init(struct btree_trans *, struct btree_path *, struct btree *);
-
-int __bch2_trans_mutex_lock(struct btree_trans *, struct mutex *);
-
-static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock)
-{
-	return mutex_trylock(lock)
-		? 0
-		: __bch2_trans_mutex_lock(trans, lock);
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_trans_verify_paths(struct btree_trans *);
-void bch2_assert_pos_locked(struct btree_trans *, enum btree_id,
-			    struct bpos, bool);
-#else
-static inline void bch2_trans_verify_paths(struct btree_trans *trans) {}
-static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
-					  struct bpos pos, bool key_cache) {}
-#endif
-
-void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
-				      struct btree *, struct bkey_packed *);
-void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *,
-			      struct btree *, struct btree_node_iter *,
-			      struct bkey_packed *, unsigned, unsigned);
-
-int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *);
-
-void bch2_path_put(struct btree_trans *, btree_path_idx_t, bool);
-
-int bch2_trans_relock(struct btree_trans *);
-int bch2_trans_relock_notrace(struct btree_trans *);
-void bch2_trans_unlock(struct btree_trans *);
-void bch2_trans_unlock_long(struct btree_trans *);
-
-static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count)
-{
-	return restart_count != trans->restart_count
-		? -BCH_ERR_transaction_restart_nested
-		: 0;
-}
-
-void __noreturn bch2_trans_restart_error(struct btree_trans *, u32);
-
-static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans,
-						   u32 restart_count)
-{
-	if (trans_was_restarted(trans, restart_count))
-		bch2_trans_restart_error(trans, restart_count);
-}
-
-void __noreturn bch2_trans_in_restart_error(struct btree_trans *);
-
-static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans)
-{
-	if (trans->restarted)
-		bch2_trans_in_restart_error(trans);
-}
-
-void __noreturn bch2_trans_unlocked_error(struct btree_trans *);
-
-static inline void bch2_trans_verify_not_unlocked(struct btree_trans *trans)
-{
-	if (!trans->locked)
-		bch2_trans_unlocked_error(trans);
-}
-
-__always_inline
-static int btree_trans_restart_nounlock(struct btree_trans *trans, int err)
-{
-	BUG_ON(err <= 0);
-	BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart));
-
-	trans->restarted = err;
-	trans->last_restarted_ip = _THIS_IP_;
-	return -err;
-}
-
-__always_inline
-static int btree_trans_restart(struct btree_trans *trans, int err)
-{
-	btree_trans_restart_nounlock(trans, err);
-	return -err;
-}
-
-bool bch2_btree_node_upgrade(struct btree_trans *,
-			     struct btree_path *, unsigned);
-
-void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned);
-
-static inline void bch2_btree_path_downgrade(struct btree_trans *trans,
-					     struct btree_path *path)
-{
-	unsigned new_locks_want = path->level + !!path->intent_ref;
-
-	if (path->locks_want > new_locks_want)
-		__bch2_btree_path_downgrade(trans, path, new_locks_want);
-}
-
-void bch2_trans_downgrade(struct btree_trans *);
-
-void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct btree *);
-void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *);
-
-int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter);
-int __must_check bch2_btree_iter_traverse(struct btree_iter *);
-
-struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
-struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *);
-struct btree *bch2_btree_iter_next_node(struct btree_iter *);
-
-struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos);
-struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
-
-static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
-{
-	return bch2_btree_iter_peek_upto(iter, SPOS_MAX);
-}
-
-struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
-
-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *);
-
-bool bch2_btree_iter_advance(struct btree_iter *);
-bool bch2_btree_iter_rewind(struct btree_iter *);
-
-static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
-{
-	iter->k.type = KEY_TYPE_deleted;
-	iter->k.p.inode		= iter->pos.inode	= new_pos.inode;
-	iter->k.p.offset	= iter->pos.offset	= new_pos.offset;
-	iter->k.p.snapshot	= iter->pos.snapshot	= new_pos.snapshot;
-	iter->k.size = 0;
-}
-
-static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
-{
-	struct btree_trans *trans = iter->trans;
-
-	if (unlikely(iter->update_path))
-		bch2_path_put(trans, iter->update_path,
-			      iter->flags & BTREE_ITER_intent);
-	iter->update_path = 0;
-
-	if (!(iter->flags & BTREE_ITER_all_snapshots))
-		new_pos.snapshot = iter->snapshot;
-
-	__bch2_btree_iter_set_pos(iter, new_pos);
-}
-
-static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter)
-{
-	BUG_ON(!(iter->flags & BTREE_ITER_is_extents));
-	iter->pos = bkey_start_pos(&iter->k);
-}
-
-static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot)
-{
-	struct bpos pos = iter->pos;
-
-	iter->snapshot = snapshot;
-	pos.snapshot = snapshot;
-	bch2_btree_iter_set_pos(iter, pos);
-}
-
-void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *);
-
-static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans,
-					       unsigned btree_id,
-					       unsigned flags)
-{
-	if (!(flags & (BTREE_ITER_all_snapshots|BTREE_ITER_not_extents)) &&
-	    btree_id_is_extents(btree_id))
-		flags |= BTREE_ITER_is_extents;
-
-	if (!(flags & BTREE_ITER_snapshot_field) &&
-	    !btree_type_has_snapshot_field(btree_id))
-		flags &= ~BTREE_ITER_all_snapshots;
-
-	if (!(flags & BTREE_ITER_all_snapshots) &&
-	    btree_type_has_snapshots(btree_id))
-		flags |= BTREE_ITER_filter_snapshots;
-
-	if (trans->journal_replay_not_finished)
-		flags |= BTREE_ITER_with_journal;
-
-	return flags;
-}
-
-static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans,
-					     unsigned btree_id,
-					     unsigned flags)
-{
-	if (!btree_id_cached(trans->c, btree_id)) {
-		flags &= ~BTREE_ITER_cached;
-		flags &= ~BTREE_ITER_with_key_cache;
-	} else if (!(flags & BTREE_ITER_cached))
-		flags |= BTREE_ITER_with_key_cache;
-
-	return __bch2_btree_iter_flags(trans, btree_id, flags);
-}
-
-static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
-					  struct btree_iter *iter,
-					  unsigned btree_id, struct bpos pos,
-					  unsigned locks_want,
-					  unsigned depth,
-					  unsigned flags,
-					  unsigned long ip)
-{
-	iter->trans		= trans;
-	iter->update_path	= 0;
-	iter->key_cache_path	= 0;
-	iter->btree_id		= btree_id;
-	iter->min_depth		= 0;
-	iter->flags		= flags;
-	iter->snapshot		= pos.snapshot;
-	iter->pos		= pos;
-	iter->k			= POS_KEY(pos);
-	iter->journal_idx	= 0;
-#ifdef CONFIG_BCACHEFS_DEBUG
-	iter->ip_allocated = ip;
-#endif
-	iter->path = bch2_path_get(trans, btree_id, iter->pos,
-				   locks_want, depth, flags, ip);
-}
-
-void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *,
-			  enum btree_id, struct bpos, unsigned);
-
-static inline void bch2_trans_iter_init(struct btree_trans *trans,
-			  struct btree_iter *iter,
-			  unsigned btree_id, struct bpos pos,
-			  unsigned flags)
-{
-	if (__builtin_constant_p(btree_id) &&
-	    __builtin_constant_p(flags))
-		bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
-				bch2_btree_iter_flags(trans, btree_id, flags),
-				_THIS_IP_);
-	else
-		bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags);
-}
-
-void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *,
-			       enum btree_id, struct bpos,
-			       unsigned, unsigned, unsigned);
-void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *);
-
-void bch2_set_btree_iter_dontneed(struct btree_iter *);
-
-void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
-
-static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
-{
-	size = roundup(size, 8);
-
-	if (likely(trans->mem_top + size <= trans->mem_bytes)) {
-		void *p = trans->mem + trans->mem_top;
-
-		trans->mem_top += size;
-		memset(p, 0, size);
-		return p;
-	} else {
-		return __bch2_trans_kmalloc(trans, size);
-	}
-}
-
-static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size)
-{
-	size = round_up(size, 8);
-
-	if (likely(trans->mem_top + size <= trans->mem_bytes)) {
-		void *p = trans->mem + trans->mem_top;
-
-		trans->mem_top += size;
-		return p;
-	} else {
-		return __bch2_trans_kmalloc(trans, size);
-	}
-}
-
-static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans,
-				struct btree_iter *iter,
-				unsigned btree_id, struct bpos pos,
-				unsigned flags, unsigned type)
-{
-	struct bkey_s_c k;
-
-	bch2_trans_iter_init(trans, iter, btree_id, pos, flags);
-	k = bch2_btree_iter_peek_slot(iter);
-
-	if (!bkey_err(k) && type && k.k->type != type)
-		k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch);
-	if (unlikely(bkey_err(k)))
-		bch2_trans_iter_exit(trans, iter);
-	return k;
-}
-
-static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans,
-				struct btree_iter *iter,
-				unsigned btree_id, struct bpos pos,
-				unsigned flags)
-{
-	return __bch2_bkey_get_iter(trans, iter, btree_id, pos, flags, 0);
-}
-
-#define bch2_bkey_get_iter_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\
-	bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter,			\
-				       _btree_id, _pos, _flags, KEY_TYPE_##_type))
-
-static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
-				unsigned btree_id, struct bpos pos,
-				unsigned flags, unsigned type,
-				unsigned val_size, void *val)
-{
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	int ret;
-
-	k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type);
-	ret = bkey_err(k);
-	if (!ret) {
-		unsigned b = min_t(unsigned, bkey_val_bytes(k.k), val_size);
-
-		memcpy(val, k.v, b);
-		if (unlikely(b < sizeof(*val)))
-			memset((void *) val + b, 0, sizeof(*val) - b);
-		bch2_trans_iter_exit(trans, &iter);
-	}
-
-	return ret;
-}
-
-#define bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, _type, _val)\
-	__bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags,	\
-				  KEY_TYPE_##_type, sizeof(*_val), _val)
-
-void bch2_trans_srcu_unlock(struct btree_trans *);
-
-u32 bch2_trans_begin(struct btree_trans *);
-
-/*
- * XXX
- * this does not handle transaction restarts from bch2_btree_iter_next_node()
- * correctly
- */
-#define __for_each_btree_node(_trans, _iter, _btree_id, _start,		\
-			      _locks_want, _depth, _flags, _b, _ret)	\
-	for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id),	\
-				_start, _locks_want, _depth, _flags);	\
-	     (_b) = bch2_btree_iter_peek_node_and_restart(&(_iter)),	\
-	     !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b);			\
-	     (_b) = bch2_btree_iter_next_node(&(_iter)))
-
-#define for_each_btree_node(_trans, _iter, _btree_id, _start,		\
-			    _flags, _b, _ret)				\
-	__for_each_btree_node(_trans, _iter, _btree_id, _start,		\
-			      0, 0, _flags, _b, _ret)
-
-static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter,
-							     unsigned flags)
-{
-	return  flags & BTREE_ITER_slots      ? bch2_btree_iter_peek_slot(iter) :
-						bch2_btree_iter_peek_prev(iter);
-}
-
-static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
-							unsigned flags)
-{
-	return  flags & BTREE_ITER_slots      ? bch2_btree_iter_peek_slot(iter) :
-						bch2_btree_iter_peek(iter);
-}
-
-static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter,
-							     struct bpos end,
-							     unsigned flags)
-{
-	if (!(flags & BTREE_ITER_slots))
-		return bch2_btree_iter_peek_upto(iter, end);
-
-	if (bkey_gt(iter->pos, end))
-		return bkey_s_c_null;
-
-	return bch2_btree_iter_peek_slot(iter);
-}
-
-int __bch2_btree_trans_too_many_iters(struct btree_trans *);
-
-static inline int btree_trans_too_many_iters(struct btree_trans *trans)
-{
-	if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_NORMAL_LIMIT - 8)
-		return __bch2_btree_trans_too_many_iters(trans);
-
-	return 0;
-}
-
-/*
- * goto instead of loop, so that when used inside for_each_btree_key2()
- * break/continue work correctly
- */
-#define lockrestart_do(_trans, _do)					\
-({									\
-	__label__ transaction_restart;					\
-	u32 _restart_count;						\
-	int _ret2;							\
-transaction_restart:							\
-	_restart_count = bch2_trans_begin(_trans);			\
-	_ret2 = (_do);							\
-									\
-	if (bch2_err_matches(_ret2, BCH_ERR_transaction_restart))	\
-		goto transaction_restart;				\
-									\
-	if (!_ret2)							\
-		bch2_trans_verify_not_restarted(_trans, _restart_count);\
-	_ret2;								\
-})
-
-/*
- * nested_lockrestart_do(), nested_commit_do():
- *
- * These are like lockrestart_do() and commit_do(), with two differences:
- *
- *  - We don't call bch2_trans_begin() unless we had a transaction restart
- *  - We return -BCH_ERR_transaction_restart_nested if we succeeded after a
- *  transaction restart
- */
-#define nested_lockrestart_do(_trans, _do)				\
-({									\
-	u32 _restart_count, _orig_restart_count;			\
-	int _ret2;							\
-									\
-	_restart_count = _orig_restart_count = (_trans)->restart_count;	\
-									\
-	while (bch2_err_matches(_ret2 = (_do), BCH_ERR_transaction_restart))\
-		_restart_count = bch2_trans_begin(_trans);		\
-									\
-	if (!_ret2)							\
-		bch2_trans_verify_not_restarted(_trans, _restart_count);\
-									\
-	_ret2 ?: trans_was_restarted(_trans, _restart_count);		\
-})
-
-#define for_each_btree_key_upto_continue(_trans, _iter,			\
-					 _end, _flags, _k, _do)		\
-({									\
-	struct bkey_s_c _k;						\
-	int _ret3 = 0;							\
-									\
-	do {								\
-		_ret3 = lockrestart_do(_trans, ({			\
-			(_k) = bch2_btree_iter_peek_upto_type(&(_iter),	\
-						_end, (_flags));	\
-			if (!(_k).k)					\
-				break;					\
-									\
-			bkey_err(_k) ?: (_do);				\
-		}));							\
-	} while (!_ret3 && bch2_btree_iter_advance(&(_iter)));		\
-									\
-	bch2_trans_iter_exit((_trans), &(_iter));			\
-	_ret3;								\
-})
-
-#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _do)	\
-	for_each_btree_key_upto_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do)
-
-#define for_each_btree_key_upto(_trans, _iter, _btree_id,		\
-				_start, _end, _flags, _k, _do)		\
-({									\
-	bch2_trans_begin(trans);					\
-									\
-	struct btree_iter _iter;					\
-	bch2_trans_iter_init((_trans), &(_iter), (_btree_id),		\
-			     (_start), (_flags));			\
-									\
-	for_each_btree_key_upto_continue(_trans, _iter, _end, _flags, _k, _do);\
-})
-
-#define for_each_btree_key(_trans, _iter, _btree_id,			\
-			   _start, _flags, _k, _do)			\
-	for_each_btree_key_upto(_trans, _iter, _btree_id, _start,	\
-				 SPOS_MAX, _flags, _k, _do)
-
-#define for_each_btree_key_reverse(_trans, _iter, _btree_id,		\
-				   _start, _flags, _k, _do)		\
-({									\
-	struct btree_iter _iter;					\
-	struct bkey_s_c _k;						\
-	int _ret3 = 0;							\
-									\
-	bch2_trans_iter_init((_trans), &(_iter), (_btree_id),		\
-			     (_start), (_flags));			\
-									\
-	do {								\
-		_ret3 = lockrestart_do(_trans, ({			\
-			(_k) = bch2_btree_iter_peek_prev_type(&(_iter),	\
-							(_flags));	\
-			if (!(_k).k)					\
-				break;					\
-									\
-			bkey_err(_k) ?: (_do);				\
-		}));							\
-	} while (!_ret3 && bch2_btree_iter_rewind(&(_iter)));		\
-									\
-	bch2_trans_iter_exit((_trans), &(_iter));			\
-	_ret3;								\
-})
-
-#define for_each_btree_key_commit(_trans, _iter, _btree_id,		\
-				  _start, _iter_flags, _k,		\
-				  _disk_res, _journal_seq, _commit_flags,\
-				  _do)					\
-	for_each_btree_key(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
-			    (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
-					(_journal_seq), (_commit_flags)))
-
-#define for_each_btree_key_reverse_commit(_trans, _iter, _btree_id,	\
-				  _start, _iter_flags, _k,		\
-				  _disk_res, _journal_seq, _commit_flags,\
-				  _do)					\
-	for_each_btree_key_reverse(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
-			    (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
-					(_journal_seq), (_commit_flags)))
-
-#define for_each_btree_key_upto_commit(_trans, _iter, _btree_id,	\
-				  _start, _end, _iter_flags, _k,	\
-				  _disk_res, _journal_seq, _commit_flags,\
-				  _do)					\
-	for_each_btree_key_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\
-			    (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
-					(_journal_seq), (_commit_flags)))
-
-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
-
-static inline struct bkey_s_c
-__bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
-				   struct btree_iter *iter, unsigned flags)
-{
-	struct bkey_s_c k;
-
-	while (btree_trans_too_many_iters(trans) ||
-	       (k = bch2_btree_iter_peek_type(iter, flags),
-		bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
-		bch2_trans_begin(trans);
-
-	return k;
-}
-
-#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id,	\
-			   _start, _end, _flags, _k, _ret)		\
-	for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id),	\
-				  (_start), (_flags));			\
-	     (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\
-	     !((_ret) = bkey_err(_k)) && (_k).k;			\
-	     bch2_btree_iter_advance(&(_iter)))
-
-#define for_each_btree_key_upto_continue_norestart(_iter, _end, _flags, _k, _ret)\
-	for (;									\
-	     (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),	\
-	     !((_ret) = bkey_err(_k)) && (_k).k;				\
-	     bch2_btree_iter_advance(&(_iter)))
-
-#define for_each_btree_key_norestart(_trans, _iter, _btree_id,		\
-			   _start, _flags, _k, _ret)			\
-	for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\
-					  SPOS_MAX, _flags, _k, _ret)
-
-#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret)	\
-	for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
-
-/*
- * This should not be used in a fastpath, without first trying _do in
- * nonblocking mode - it will cause excessive transaction restarts and
- * potentially livelocking:
- */
-#define drop_locks_do(_trans, _do)					\
-({									\
-	bch2_trans_unlock(_trans);					\
-	_do ?: bch2_trans_relock(_trans);				\
-})
-
-#define allocate_dropping_locks_errcode(_trans, _do)			\
-({									\
-	gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN;				\
-	int _ret = _do;							\
-									\
-	if (bch2_err_matches(_ret, ENOMEM)) {				\
-		_gfp = GFP_KERNEL;					\
-		_ret = drop_locks_do(trans, _do);			\
-	}								\
-	_ret;								\
-})
-
-#define allocate_dropping_locks(_trans, _ret, _do)			\
-({									\
-	gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN;				\
-	typeof(_do) _p = _do;						\
-									\
-	_ret = 0;							\
-	if (unlikely(!_p)) {						\
-		_gfp = GFP_KERNEL;					\
-		_ret = drop_locks_do(trans, ((_p = _do), 0));		\
-	}								\
-	_p;								\
-})
-
-void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
-void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
-void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
-void bch2_dump_trans_updates(struct btree_trans *);
-void bch2_dump_trans_paths_updates(struct btree_trans *);
-
-struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
-void bch2_trans_put(struct btree_trans *);
-
-extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
-unsigned bch2_trans_get_fn_idx(const char *);
-
-#define bch2_trans_get(_c)						\
-({									\
-	static unsigned trans_fn_idx;					\
-									\
-	if (unlikely(!trans_fn_idx))					\
-		trans_fn_idx = bch2_trans_get_fn_idx(__func__);		\
-	__bch2_trans_get(_c, trans_fn_idx);				\
-})
-
-void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *);
-
-void bch2_fs_btree_iter_exit(struct bch_fs *);
-void bch2_fs_btree_iter_init_early(struct bch_fs *);
-int bch2_fs_btree_iter_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_ITER_H */
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
deleted file mode 100644
index 332dbf164929..000000000000
--- a/fs/bcachefs/btree_journal_iter.c
+++ /dev/null
@@ -1,642 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "bset.h"
-#include "btree_cache.h"
-#include "btree_journal_iter.h"
-#include "journal_io.h"
-
-#include <linux/sort.h>
-
-/*
- * For managing keys we read from the journal: until journal replay works normal
- * btree lookups need to be able to find and return keys from the journal where
- * they overwrite what's in the btree, so we have a special iterator and
- * operations for the regular btree iter code to use:
- */
-
-static int __journal_key_cmp(enum btree_id	l_btree_id,
-			     unsigned		l_level,
-			     struct bpos	l_pos,
-			     const struct journal_key *r)
-{
-	return (cmp_int(l_btree_id,	r->btree_id) ?:
-		cmp_int(l_level,	r->level) ?:
-		bpos_cmp(l_pos,	r->k->k.p));
-}
-
-static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
-{
-	return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
-}
-
-static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
-{
-	size_t gap_size = keys->size - keys->nr;
-
-	if (idx >= keys->gap)
-		idx += gap_size;
-	return idx;
-}
-
-static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
-{
-	return keys->data + idx_to_pos(keys, idx);
-}
-
-static size_t __bch2_journal_key_search(struct journal_keys *keys,
-					enum btree_id id, unsigned level,
-					struct bpos pos)
-{
-	size_t l = 0, r = keys->nr, m;
-
-	while (l < r) {
-		m = l + ((r - l) >> 1);
-		if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
-			l = m + 1;
-		else
-			r = m;
-	}
-
-	BUG_ON(l < keys->nr &&
-	       __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);
-
-	BUG_ON(l &&
-	       __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);
-
-	return l;
-}
-
-static size_t bch2_journal_key_search(struct journal_keys *keys,
-				      enum btree_id id, unsigned level,
-				      struct bpos pos)
-{
-	return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
-}
-
-/* Returns first non-overwritten key >= search key: */
-struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
-					   unsigned level, struct bpos pos,
-					   struct bpos end_pos, size_t *idx)
-{
-	struct journal_keys *keys = &c->journal_keys;
-	unsigned iters = 0;
-	struct journal_key *k;
-
-	BUG_ON(*idx > keys->nr);
-search:
-	if (!*idx)
-		*idx = __bch2_journal_key_search(keys, btree_id, level, pos);
-
-	while (*idx &&
-	       __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
-		--(*idx);
-		iters++;
-		if (iters == 10) {
-			*idx = 0;
-			goto search;
-		}
-	}
-
-	while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
-		if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
-			return NULL;
-
-		if (k->overwritten) {
-			(*idx)++;
-			continue;
-		}
-
-		if (__journal_key_cmp(btree_id, level, pos, k) <= 0)
-			return k->k;
-
-		(*idx)++;
-		iters++;
-		if (iters == 10) {
-			*idx = 0;
-			goto search;
-		}
-	}
-
-	return NULL;
-}
-
-struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
-					   unsigned level, struct bpos pos)
-{
-	size_t idx = 0;
-
-	return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
-}
-
-static void journal_iter_verify(struct journal_iter *iter)
-{
-	struct journal_keys *keys = iter->keys;
-	size_t gap_size = keys->size - keys->nr;
-
-	BUG_ON(iter->idx >= keys->gap &&
-	       iter->idx <  keys->gap + gap_size);
-
-	if (iter->idx < keys->size) {
-		struct journal_key *k = keys->data + iter->idx;
-
-		int cmp = cmp_int(k->btree_id,	iter->btree_id) ?:
-			  cmp_int(k->level,	iter->level);
-		BUG_ON(cmp < 0);
-	}
-}
-
-static void journal_iters_fix(struct bch_fs *c)
-{
-	struct journal_keys *keys = &c->journal_keys;
-	/* The key we just inserted is immediately before the gap: */
-	size_t gap_end = keys->gap + (keys->size - keys->nr);
-	struct journal_key *new_key = &keys->data[keys->gap - 1];
-	struct journal_iter *iter;
-
-	/*
-	 * If an iterator points one after the key we just inserted, decrement
-	 * the iterator so it points at the key we just inserted - if the
-	 * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
-	 * handle that:
-	 */
-	list_for_each_entry(iter, &c->journal_iters, list) {
-		journal_iter_verify(iter);
-		if (iter->idx		== gap_end &&
-		    new_key->btree_id	== iter->btree_id &&
-		    new_key->level	== iter->level)
-			iter->idx = keys->gap - 1;
-		journal_iter_verify(iter);
-	}
-}
-
-static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
-{
-	struct journal_keys *keys = &c->journal_keys;
-	struct journal_iter *iter;
-	size_t gap_size = keys->size - keys->nr;
-
-	list_for_each_entry(iter, &c->journal_iters, list) {
-		if (iter->idx > old_gap)
-			iter->idx -= gap_size;
-		if (iter->idx >= new_gap)
-			iter->idx += gap_size;
-	}
-}
-
-int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
-				 unsigned level, struct bkey_i *k)
-{
-	struct journal_key n = {
-		.btree_id	= id,
-		.level		= level,
-		.k		= k,
-		.allocated	= true,
-		/*
-		 * Ensure these keys are done last by journal replay, to unblock
-		 * journal reclaim:
-		 */
-		.journal_seq	= U32_MAX,
-	};
-	struct journal_keys *keys = &c->journal_keys;
-	size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
-
-	BUG_ON(test_bit(BCH_FS_rw, &c->flags));
-
-	if (idx < keys->size &&
-	    journal_key_cmp(&n, &keys->data[idx]) == 0) {
-		if (keys->data[idx].allocated)
-			kfree(keys->data[idx].k);
-		keys->data[idx] = n;
-		return 0;
-	}
-
-	if (idx > keys->gap)
-		idx -= keys->size - keys->nr;
-
-	size_t old_gap = keys->gap;
-
-	if (keys->nr == keys->size) {
-		journal_iters_move_gap(c, old_gap, keys->size);
-		old_gap = keys->size;
-
-		struct journal_keys new_keys = {
-			.nr			= keys->nr,
-			.size			= max_t(size_t, keys->size, 8) * 2,
-		};
-
-		new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL);
-		if (!new_keys.data) {
-			bch_err(c, "%s: error allocating new key array (size %zu)",
-				__func__, new_keys.size);
-			return -BCH_ERR_ENOMEM_journal_key_insert;
-		}
-
-		/* Since @keys was full, there was no gap: */
-		memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr);
-		kvfree(keys->data);
-		keys->data	= new_keys.data;
-		keys->nr	= new_keys.nr;
-		keys->size	= new_keys.size;
-
-		/* And now the gap is at the end: */
-		keys->gap	= keys->nr;
-	}
-
-	journal_iters_move_gap(c, old_gap, idx);
-
-	move_gap(keys, idx);
-
-	keys->nr++;
-	keys->data[keys->gap++] = n;
-
-	journal_iters_fix(c);
-
-	return 0;
-}
-
-/*
- * Can only be used from the recovery thread while we're still RO - can't be
- * used once we've got RW, as journal_keys is at that point used by multiple
- * threads:
- */
-int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
-			    unsigned level, struct bkey_i *k)
-{
-	struct bkey_i *n;
-	int ret;
-
-	n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
-	if (!n)
-		return -BCH_ERR_ENOMEM_journal_key_insert;
-
-	bkey_copy(n, k);
-	ret = bch2_journal_key_insert_take(c, id, level, n);
-	if (ret)
-		kfree(n);
-	return ret;
-}
-
-int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
-			    unsigned level, struct bpos pos)
-{
-	struct bkey_i whiteout;
-
-	bkey_init(&whiteout.k);
-	whiteout.k.p = pos;
-
-	return bch2_journal_key_insert(c, id, level, &whiteout);
-}
-
-bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
-				 unsigned level, struct bpos pos)
-{
-	struct journal_keys *keys = &trans->c->journal_keys;
-	size_t idx = bch2_journal_key_search(keys, btree, level, pos);
-
-	if (!trans->journal_replay_not_finished)
-		return false;
-
-	return (idx < keys->size &&
-		keys->data[idx].btree_id	== btree &&
-		keys->data[idx].level		== level &&
-		bpos_eq(keys->data[idx].k->k.p, pos) &&
-		bkey_deleted(&keys->data[idx].k->k));
-}
-
-void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
-				  unsigned level, struct bpos pos)
-{
-	struct journal_keys *keys = &c->journal_keys;
-	size_t idx = bch2_journal_key_search(keys, btree, level, pos);
-
-	if (idx < keys->size &&
-	    keys->data[idx].btree_id	== btree &&
-	    keys->data[idx].level	== level &&
-	    bpos_eq(keys->data[idx].k->k.p, pos))
-		keys->data[idx].overwritten = true;
-}
-
-static void bch2_journal_iter_advance(struct journal_iter *iter)
-{
-	if (iter->idx < iter->keys->size) {
-		iter->idx++;
-		if (iter->idx == iter->keys->gap)
-			iter->idx += iter->keys->size - iter->keys->nr;
-	}
-}
-
-static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
-{
-	journal_iter_verify(iter);
-
-	while (iter->idx < iter->keys->size) {
-		struct journal_key *k = iter->keys->data + iter->idx;
-
-		int cmp = cmp_int(k->btree_id,	iter->btree_id) ?:
-			  cmp_int(k->level,	iter->level);
-		if (cmp > 0)
-			break;
-		BUG_ON(cmp);
-
-		if (!k->overwritten)
-			return bkey_i_to_s_c(k->k);
-
-		bch2_journal_iter_advance(iter);
-	}
-
-	return bkey_s_c_null;
-}
-
-static void bch2_journal_iter_exit(struct journal_iter *iter)
-{
-	list_del(&iter->list);
-}
-
-static void bch2_journal_iter_init(struct bch_fs *c,
-				   struct journal_iter *iter,
-				   enum btree_id id, unsigned level,
-				   struct bpos pos)
-{
-	iter->btree_id	= id;
-	iter->level	= level;
-	iter->keys	= &c->journal_keys;
-	iter->idx	= bch2_journal_key_search(&c->journal_keys, id, level, pos);
-
-	journal_iter_verify(iter);
-}
-
-static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
-{
-	return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
-						iter->b, &iter->unpacked);
-}
-
-static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
-{
-	bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
-}
-
-void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
-{
-	if (bpos_eq(iter->pos, SPOS_MAX))
-		iter->at_end = true;
-	else
-		iter->pos = bpos_successor(iter->pos);
-}
-
-static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter)
-{
-	struct btree_and_journal_iter iter = *_iter;
-	struct bch_fs *c = iter.trans->c;
-	unsigned level = iter.journal.level;
-	struct bkey_buf tmp;
-	unsigned nr = test_bit(BCH_FS_started, &c->flags)
-		? (level > 1 ? 0 :  2)
-		: (level > 1 ? 1 : 16);
-
-	iter.prefetch = false;
-	bch2_bkey_buf_init(&tmp);
-
-	while (nr--) {
-		bch2_btree_and_journal_iter_advance(&iter);
-		struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
-		if (!k.k)
-			break;
-
-		bch2_bkey_buf_reassemble(&tmp, c, k);
-		bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1);
-	}
-
-	bch2_bkey_buf_exit(&tmp, c);
-}
-
-struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
-{
-	struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret;
-
-	if (iter->prefetch && iter->journal.level)
-		btree_and_journal_iter_prefetch(iter);
-again:
-	if (iter->at_end)
-		return bkey_s_c_null;
-
-	while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
-	       bpos_lt(btree_k.k->p, iter->pos))
-		bch2_journal_iter_advance_btree(iter);
-
-	if (iter->trans->journal_replay_not_finished)
-		while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
-		       bpos_lt(journal_k.k->p, iter->pos))
-			bch2_journal_iter_advance(&iter->journal);
-
-	ret = journal_k.k &&
-		(!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
-		? journal_k
-		: btree_k;
-
-	if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
-		ret = bkey_s_c_null;
-
-	if (ret.k) {
-		iter->pos = ret.k->p;
-		if (bkey_deleted(ret.k)) {
-			bch2_btree_and_journal_iter_advance(iter);
-			goto again;
-		}
-	} else {
-		iter->pos = SPOS_MAX;
-		iter->at_end = true;
-	}
-
-	return ret;
-}
-
-void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
-{
-	bch2_journal_iter_exit(&iter->journal);
-}
-
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
-						  struct btree_and_journal_iter *iter,
-						  struct btree *b,
-						  struct btree_node_iter node_iter,
-						  struct bpos pos)
-{
-	memset(iter, 0, sizeof(*iter));
-
-	iter->trans = trans;
-	iter->b = b;
-	iter->node_iter = node_iter;
-	iter->pos = b->data->min_key;
-	iter->at_end = false;
-	INIT_LIST_HEAD(&iter->journal.list);
-
-	if (trans->journal_replay_not_finished) {
-		bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
-		if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags))
-			list_add(&iter->journal.list, &trans->c->journal_iters);
-	}
-}
-
-/*
- * this version is used by btree_gc before filesystem has gone RW and
- * multithreaded, so uses the journal_iters list:
- */
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
-						struct btree_and_journal_iter *iter,
-						struct btree *b)
-{
-	struct btree_node_iter node_iter;
-
-	bch2_btree_node_iter_init_from_start(&node_iter, b);
-	__bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
-}
-
-/* sort and dedup all keys in the journal: */
-
-void bch2_journal_entries_free(struct bch_fs *c)
-{
-	struct journal_replay **i;
-	struct genradix_iter iter;
-
-	genradix_for_each(&c->journal_entries, iter, i)
-		kvfree(*i);
-	genradix_free(&c->journal_entries);
-}
-
-/*
- * When keys compare equal, oldest compares first:
- */
-static int journal_sort_key_cmp(const void *_l, const void *_r)
-{
-	const struct journal_key *l = _l;
-	const struct journal_key *r = _r;
-
-	return  journal_key_cmp(l, r) ?:
-		cmp_int(l->journal_seq, r->journal_seq) ?:
-		cmp_int(l->journal_offset, r->journal_offset);
-}
-
-void bch2_journal_keys_put(struct bch_fs *c)
-{
-	struct journal_keys *keys = &c->journal_keys;
-
-	BUG_ON(atomic_read(&keys->ref) <= 0);
-
-	if (!atomic_dec_and_test(&keys->ref))
-		return;
-
-	move_gap(keys, keys->nr);
-
-	darray_for_each(*keys, i)
-		if (i->allocated)
-			kfree(i->k);
-
-	kvfree(keys->data);
-	keys->data = NULL;
-	keys->nr = keys->gap = keys->size = 0;
-
-	bch2_journal_entries_free(c);
-}
-
-static void __journal_keys_sort(struct journal_keys *keys)
-{
-	sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);
-
-	struct journal_key *dst = keys->data;
-
-	darray_for_each(*keys, src) {
-		if (src + 1 < &darray_top(*keys) &&
-		    !journal_key_cmp(src, src + 1))
-			continue;
-
-		*dst++ = *src;
-	}
-
-	keys->nr = dst - keys->data;
-}
-
-int bch2_journal_keys_sort(struct bch_fs *c)
-{
-	struct genradix_iter iter;
-	struct journal_replay *i, **_i;
-	struct journal_keys *keys = &c->journal_keys;
-	size_t nr_read = 0;
-
-	genradix_for_each(&c->journal_entries, iter, _i) {
-		i = *_i;
-
-		if (journal_replay_ignore(i))
-			continue;
-
-		cond_resched();
-
-		for_each_jset_key(k, entry, &i->j) {
-			struct journal_key n = (struct journal_key) {
-				.btree_id	= entry->btree_id,
-				.level		= entry->level,
-				.k		= k,
-				.journal_seq	= le64_to_cpu(i->j.seq),
-				.journal_offset	= k->_data - i->j._data,
-			};
-
-			if (darray_push(keys, n)) {
-				__journal_keys_sort(keys);
-
-				if (keys->nr * 8 > keys->size * 7) {
-					bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
-						keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
-					return -BCH_ERR_ENOMEM_journal_keys_sort;
-				}
-
-				BUG_ON(darray_push(keys, n));
-			}
-
-			nr_read++;
-		}
-	}
-
-	__journal_keys_sort(keys);
-	keys->gap = keys->nr;
-
-	bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr);
-	return 0;
-}
-
-void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
-				  unsigned level_min, unsigned level_max,
-				  struct bpos start, struct bpos end)
-{
-	struct journal_keys *keys = &c->journal_keys;
-	size_t dst = 0;
-
-	move_gap(keys, keys->nr);
-
-	darray_for_each(*keys, i)
-		if (!(i->btree_id == btree &&
-		      i->level >= level_min &&
-		      i->level <= level_max &&
-		      bpos_ge(i->k->k.p, start) &&
-		      bpos_le(i->k->k.p, end)))
-			keys->data[dst++] = *i;
-	keys->nr = keys->gap = dst;
-}
-
-void bch2_journal_keys_dump(struct bch_fs *c)
-{
-	struct journal_keys *keys = &c->journal_keys;
-	struct printbuf buf = PRINTBUF;
-
-	pr_info("%zu keys:", keys->nr);
-
-	move_gap(keys, keys->nr);
-
-	darray_for_each(*keys, i) {
-		printbuf_reset(&buf);
-		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
-		pr_err("%s l=%u %s", bch2_btree_id_str(i->btree_id), i->level, buf.buf);
-	}
-	printbuf_exit(&buf);
-}
diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h
deleted file mode 100644
index 1ba4a79b0ef9..000000000000
--- a/fs/bcachefs/btree_journal_iter.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H
-#define _BCACHEFS_BTREE_JOURNAL_ITER_H
-
-struct journal_iter {
-	struct list_head	list;
-	enum btree_id		btree_id;
-	unsigned		level;
-	size_t			idx;
-	struct journal_keys	*keys;
-};
-
-/*
- * Iterate over keys in the btree, with keys from the journal overlaid on top:
- */
-
-struct btree_and_journal_iter {
-	struct btree_trans	*trans;
-	struct btree		*b;
-	struct btree_node_iter	node_iter;
-	struct bkey		unpacked;
-
-	struct journal_iter	journal;
-	struct bpos		pos;
-	bool			at_end;
-	bool			prefetch;
-};
-
-struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
-				unsigned, struct bpos, struct bpos, size_t *);
-struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
-					   unsigned, struct bpos);
-
-int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *,
-					 struct btree_and_journal_iter *);
-
-int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
-				 unsigned, struct bkey_i *);
-int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
-			    unsigned, struct bkey_i *);
-int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
-			    unsigned, struct bpos);
-bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned, struct bpos);
-void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos);
-
-void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
-struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
-
-void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
-				struct btree_and_journal_iter *, struct btree *,
-				struct btree_node_iter, struct bpos);
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
-				struct btree_and_journal_iter *, struct btree *);
-
-void bch2_journal_keys_put(struct bch_fs *);
-
-static inline void bch2_journal_keys_put_initial(struct bch_fs *c)
-{
-	if (c->journal_keys.initial_ref_held)
-		bch2_journal_keys_put(c);
-	c->journal_keys.initial_ref_held = false;
-}
-
-void bch2_journal_entries_free(struct bch_fs *);
-
-int bch2_journal_keys_sort(struct bch_fs *);
-
-void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id,
-				  unsigned, unsigned,
-				  struct bpos, struct bpos);
-
-void bch2_journal_keys_dump(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
deleted file mode 100644
index 75f5e6fe4634..000000000000
--- a/fs/bcachefs/btree_key_cache.c
+++ /dev/null
@@ -1,1091 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "btree_key_cache.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "errcode.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_reclaim.h"
-#include "trace.h"
-
-#include <linux/sched/mm.h>
-
-static inline bool btree_uses_pcpu_readers(enum btree_id id)
-{
-	return id == BTREE_ID_subvolumes;
-}
-
-static struct kmem_cache *bch2_key_cache;
-
-static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
-				       const void *obj)
-{
-	const struct bkey_cached *ck = obj;
-	const struct bkey_cached_key *key = arg->key;
-
-	return ck->key.btree_id != key->btree_id ||
-		!bpos_eq(ck->key.pos, key->pos);
-}
-
-static const struct rhashtable_params bch2_btree_key_cache_params = {
-	.head_offset	= offsetof(struct bkey_cached, hash),
-	.key_offset	= offsetof(struct bkey_cached, key),
-	.key_len	= sizeof(struct bkey_cached_key),
-	.obj_cmpfn	= bch2_btree_key_cache_cmp_fn,
-};
-
-__flatten
-inline struct bkey_cached *
-bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
-{
-	struct bkey_cached_key key = {
-		.btree_id	= btree_id,
-		.pos		= pos,
-	};
-
-	return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
-				      bch2_btree_key_cache_params);
-}
-
-static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
-{
-	if (!six_trylock_intent(&ck->c.lock))
-		return false;
-
-	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-		six_unlock_intent(&ck->c.lock);
-		return false;
-	}
-
-	if (!six_trylock_write(&ck->c.lock)) {
-		six_unlock_intent(&ck->c.lock);
-		return false;
-	}
-
-	return true;
-}
-
-static void bkey_cached_evict(struct btree_key_cache *c,
-			      struct bkey_cached *ck)
-{
-	BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
-				      bch2_btree_key_cache_params));
-	memset(&ck->key, ~0, sizeof(ck->key));
-
-	atomic_long_dec(&c->nr_keys);
-}
-
-static void bkey_cached_free(struct btree_key_cache *bc,
-			     struct bkey_cached *ck)
-{
-	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
-
-	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
-
-	ck->btree_trans_barrier_seq =
-		start_poll_synchronize_srcu(&c->btree_trans_barrier);
-
-	if (ck->c.lock.readers) {
-		list_move_tail(&ck->list, &bc->freed_pcpu);
-		bc->nr_freed_pcpu++;
-	} else {
-		list_move_tail(&ck->list, &bc->freed_nonpcpu);
-		bc->nr_freed_nonpcpu++;
-	}
-	atomic_long_inc(&bc->nr_freed);
-
-	kfree(ck->k);
-	ck->k		= NULL;
-	ck->u64s	= 0;
-
-	six_unlock_write(&ck->c.lock);
-	six_unlock_intent(&ck->c.lock);
-}
-
-#ifdef __KERNEL__
-static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
-						   struct bkey_cached *ck)
-{
-	struct bkey_cached *pos;
-
-	bc->nr_freed_nonpcpu++;
-
-	list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
-		if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
-				 pos->btree_trans_barrier_seq)) {
-			list_move(&ck->list, &pos->list);
-			return;
-		}
-	}
-
-	list_move(&ck->list, &bc->freed_nonpcpu);
-}
-#endif
-
-static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
-					 struct bkey_cached *ck)
-{
-	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
-
-	if (!ck->c.lock.readers) {
-#ifdef __KERNEL__
-		struct btree_key_cache_freelist *f;
-		bool freed = false;
-
-		preempt_disable();
-		f = this_cpu_ptr(bc->pcpu_freed);
-
-		if (f->nr < ARRAY_SIZE(f->objs)) {
-			f->objs[f->nr++] = ck;
-			freed = true;
-		}
-		preempt_enable();
-
-		if (!freed) {
-			mutex_lock(&bc->lock);
-			preempt_disable();
-			f = this_cpu_ptr(bc->pcpu_freed);
-
-			while (f->nr > ARRAY_SIZE(f->objs) / 2) {
-				struct bkey_cached *ck2 = f->objs[--f->nr];
-
-				__bkey_cached_move_to_freelist_ordered(bc, ck2);
-			}
-			preempt_enable();
-
-			__bkey_cached_move_to_freelist_ordered(bc, ck);
-			mutex_unlock(&bc->lock);
-		}
-#else
-		mutex_lock(&bc->lock);
-		list_move_tail(&ck->list, &bc->freed_nonpcpu);
-		bc->nr_freed_nonpcpu++;
-		mutex_unlock(&bc->lock);
-#endif
-	} else {
-		mutex_lock(&bc->lock);
-		list_move_tail(&ck->list, &bc->freed_pcpu);
-		bc->nr_freed_pcpu++;
-		mutex_unlock(&bc->lock);
-	}
-}
-
-static void bkey_cached_free_fast(struct btree_key_cache *bc,
-				  struct bkey_cached *ck)
-{
-	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
-
-	ck->btree_trans_barrier_seq =
-		start_poll_synchronize_srcu(&c->btree_trans_barrier);
-
-	list_del_init(&ck->list);
-	atomic_long_inc(&bc->nr_freed);
-
-	kfree(ck->k);
-	ck->k		= NULL;
-	ck->u64s	= 0;
-
-	bkey_cached_move_to_freelist(bc, ck);
-
-	six_unlock_write(&ck->c.lock);
-	six_unlock_intent(&ck->c.lock);
-}
-
-static struct bkey_cached *
-bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
-		  bool *was_new)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_key_cache *bc = &c->btree_key_cache;
-	struct bkey_cached *ck = NULL;
-	bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
-	int ret;
-
-	if (!pcpu_readers) {
-#ifdef __KERNEL__
-		struct btree_key_cache_freelist *f;
-
-		preempt_disable();
-		f = this_cpu_ptr(bc->pcpu_freed);
-		if (f->nr)
-			ck = f->objs[--f->nr];
-		preempt_enable();
-
-		if (!ck) {
-			mutex_lock(&bc->lock);
-			preempt_disable();
-			f = this_cpu_ptr(bc->pcpu_freed);
-
-			while (!list_empty(&bc->freed_nonpcpu) &&
-			       f->nr < ARRAY_SIZE(f->objs) / 2) {
-				ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
-				list_del_init(&ck->list);
-				bc->nr_freed_nonpcpu--;
-				f->objs[f->nr++] = ck;
-			}
-
-			ck = f->nr ? f->objs[--f->nr] : NULL;
-			preempt_enable();
-			mutex_unlock(&bc->lock);
-		}
-#else
-		mutex_lock(&bc->lock);
-		if (!list_empty(&bc->freed_nonpcpu)) {
-			ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
-			list_del_init(&ck->list);
-			bc->nr_freed_nonpcpu--;
-		}
-		mutex_unlock(&bc->lock);
-#endif
-	} else {
-		mutex_lock(&bc->lock);
-		if (!list_empty(&bc->freed_pcpu)) {
-			ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
-			list_del_init(&ck->list);
-			bc->nr_freed_pcpu--;
-		}
-		mutex_unlock(&bc->lock);
-	}
-
-	if (ck) {
-		ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
-		if (unlikely(ret)) {
-			bkey_cached_move_to_freelist(bc, ck);
-			return ERR_PTR(ret);
-		}
-
-		path->l[0].b = (void *) ck;
-		path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
-		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
-
-		ret = bch2_btree_node_lock_write(trans, path, &ck->c);
-		if (unlikely(ret)) {
-			btree_node_unlock(trans, path, 0);
-			bkey_cached_move_to_freelist(bc, ck);
-			return ERR_PTR(ret);
-		}
-
-		return ck;
-	}
-
-	ck = allocate_dropping_locks(trans, ret,
-			kmem_cache_zalloc(bch2_key_cache, _gfp));
-	if (ret) {
-		kmem_cache_free(bch2_key_cache, ck);
-		return ERR_PTR(ret);
-	}
-
-	if (!ck)
-		return NULL;
-
-	INIT_LIST_HEAD(&ck->list);
-	bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
-
-	ck->c.cached = true;
-	BUG_ON(!six_trylock_intent(&ck->c.lock));
-	BUG_ON(!six_trylock_write(&ck->c.lock));
-	*was_new = true;
-	return ck;
-}
-
-static struct bkey_cached *
-bkey_cached_reuse(struct btree_key_cache *c)
-{
-	struct bucket_table *tbl;
-	struct rhash_head *pos;
-	struct bkey_cached *ck;
-	unsigned i;
-
-	mutex_lock(&c->lock);
-	rcu_read_lock();
-	tbl = rht_dereference_rcu(c->table.tbl, &c->table);
-	for (i = 0; i < tbl->size; i++)
-		rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
-			if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
-			    bkey_cached_lock_for_evict(ck)) {
-				bkey_cached_evict(c, ck);
-				goto out;
-			}
-		}
-	ck = NULL;
-out:
-	rcu_read_unlock();
-	mutex_unlock(&c->lock);
-	return ck;
-}
-
-static struct bkey_cached *
-btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_key_cache *bc = &c->btree_key_cache;
-	struct bkey_cached *ck;
-	bool was_new = false;
-
-	ck = bkey_cached_alloc(trans, path, &was_new);
-	if (IS_ERR(ck))
-		return ck;
-
-	if (unlikely(!ck)) {
-		ck = bkey_cached_reuse(bc);
-		if (unlikely(!ck)) {
-			bch_err(c, "error allocating memory for key cache item, btree %s",
-				bch2_btree_id_str(path->btree_id));
-			return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
-		}
-
-		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
-	}
-
-	ck->c.level		= 0;
-	ck->c.btree_id		= path->btree_id;
-	ck->key.btree_id	= path->btree_id;
-	ck->key.pos		= path->pos;
-	ck->valid		= false;
-	ck->flags		= 1U << BKEY_CACHED_ACCESSED;
-
-	if (unlikely(rhashtable_lookup_insert_fast(&bc->table,
-					  &ck->hash,
-					  bch2_btree_key_cache_params))) {
-		/* We raced with another fill: */
-
-		if (likely(was_new)) {
-			six_unlock_write(&ck->c.lock);
-			six_unlock_intent(&ck->c.lock);
-			kfree(ck);
-		} else {
-			bkey_cached_free_fast(bc, ck);
-		}
-
-		mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
-		return NULL;
-	}
-
-	atomic_long_inc(&bc->nr_keys);
-
-	six_unlock_write(&ck->c.lock);
-
-	return ck;
-}
-
-static int btree_key_cache_fill(struct btree_trans *trans,
-				struct btree_path *ck_path,
-				struct bkey_cached *ck)
-{
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	unsigned new_u64s = 0;
-	struct bkey_i *new_k = NULL;
-	int ret;
-
-	bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos,
-			     BTREE_ITER_key_cache_fill|
-			     BTREE_ITER_cached_nofill);
-	iter.flags &= ~BTREE_ITER_with_journal;
-	k = bch2_btree_iter_peek_slot(&iter);
-	ret = bkey_err(k);
-	if (ret)
-		goto err;
-
-	if (!bch2_btree_node_relock(trans, ck_path, 0)) {
-		trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
-		ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
-		goto err;
-	}
-
-	/*
-	 * bch2_varint_decode can read past the end of the buffer by at
-	 * most 7 bytes (it won't be used):
-	 */
-	new_u64s = k.k->u64s + 1;
-
-	/*
-	 * Allocate some extra space so that the transaction commit path is less
-	 * likely to have to reallocate, since that requires a transaction
-	 * restart:
-	 */
-	new_u64s = min(256U, (new_u64s * 3) / 2);
-
-	if (new_u64s > ck->u64s) {
-		new_u64s = roundup_pow_of_two(new_u64s);
-		new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
-		if (!new_k) {
-			bch2_trans_unlock(trans);
-
-			new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
-			if (!new_k) {
-				bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
-					bch2_btree_id_str(ck->key.btree_id), new_u64s);
-				ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
-				goto err;
-			}
-
-			if (!bch2_btree_node_relock(trans, ck_path, 0)) {
-				kfree(new_k);
-				trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
-				ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
-				goto err;
-			}
-
-			ret = bch2_trans_relock(trans);
-			if (ret) {
-				kfree(new_k);
-				goto err;
-			}
-		}
-	}
-
-	ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c);
-	if (ret) {
-		kfree(new_k);
-		goto err;
-	}
-
-	if (new_k) {
-		kfree(ck->k);
-		ck->u64s = new_u64s;
-		ck->k = new_k;
-	}
-
-	bkey_reassemble(ck->k, k);
-	ck->valid = true;
-	bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);
-
-	/* We're not likely to need this iterator again: */
-	bch2_set_btree_iter_dontneed(&iter);
-err:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-static noinline int
-bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path,
-					 unsigned flags)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_cached *ck;
-	int ret = 0;
-
-	BUG_ON(path->level);
-
-	path->l[1].b = NULL;
-
-	if (bch2_btree_node_relock_notrace(trans, path, 0)) {
-		ck = (void *) path->l[0].b;
-		goto fill;
-	}
-retry:
-	ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
-	if (!ck) {
-		ck = btree_key_cache_create(trans, path);
-		ret = PTR_ERR_OR_ZERO(ck);
-		if (ret)
-			goto err;
-		if (!ck)
-			goto retry;
-
-		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
-		path->locks_want = 1;
-	} else {
-		enum six_lock_type lock_want = __btree_lock_want(path, 0);
-
-		ret = btree_node_lock(trans, path, (void *) ck, 0,
-				      lock_want, _THIS_IP_);
-		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			goto err;
-
-		BUG_ON(ret);
-
-		if (ck->key.btree_id != path->btree_id ||
-		    !bpos_eq(ck->key.pos, path->pos)) {
-			six_unlock_type(&ck->c.lock, lock_want);
-			goto retry;
-		}
-
-		mark_btree_node_locked(trans, path, 0,
-				       (enum btree_node_locked_type) lock_want);
-	}
-
-	path->l[0].lock_seq	= six_lock_seq(&ck->c.lock);
-	path->l[0].b		= (void *) ck;
-fill:
-	path->uptodate = BTREE_ITER_UPTODATE;
-
-	if (!ck->valid && !(flags & BTREE_ITER_cached_nofill)) {
-		ret =   bch2_btree_path_upgrade(trans, path, 1) ?:
-			btree_key_cache_fill(trans, path, ck) ?:
-			bch2_btree_path_relock(trans, path, _THIS_IP_);
-		if (ret)
-			goto err;
-
-		path->uptodate = BTREE_ITER_UPTODATE;
-	}
-
-	if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
-		set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
-
-	BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
-	BUG_ON(path->uptodate);
-
-	return ret;
-err:
-	path->uptodate = BTREE_ITER_NEED_TRAVERSE;
-	if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-		btree_node_unlock(trans, path, 0);
-		path->l[0].b = ERR_PTR(ret);
-	}
-	return ret;
-}
-
-int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
-				    unsigned flags)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_cached *ck;
-	int ret = 0;
-
-	EBUG_ON(path->level);
-
-	path->l[1].b = NULL;
-
-	if (bch2_btree_node_relock_notrace(trans, path, 0)) {
-		ck = (void *) path->l[0].b;
-		goto fill;
-	}
-retry:
-	ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
-	if (!ck) {
-		return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
-	} else {
-		enum six_lock_type lock_want = __btree_lock_want(path, 0);
-
-		ret = btree_node_lock(trans, path, (void *) ck, 0,
-				      lock_want, _THIS_IP_);
-		EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart));
-
-		if (ret)
-			return ret;
-
-		if (ck->key.btree_id != path->btree_id ||
-		    !bpos_eq(ck->key.pos, path->pos)) {
-			six_unlock_type(&ck->c.lock, lock_want);
-			goto retry;
-		}
-
-		mark_btree_node_locked(trans, path, 0,
-				       (enum btree_node_locked_type) lock_want);
-	}
-
-	path->l[0].lock_seq	= six_lock_seq(&ck->c.lock);
-	path->l[0].b		= (void *) ck;
-fill:
-	if (!ck->valid)
-		return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
-
-	if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
-		set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
-
-	path->uptodate = BTREE_ITER_UPTODATE;
-	EBUG_ON(!ck->valid);
-	EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
-
-	return ret;
-}
-
-static int btree_key_cache_flush_pos(struct btree_trans *trans,
-				     struct bkey_cached_key key,
-				     u64 journal_seq,
-				     unsigned commit_flags,
-				     bool evict)
-{
-	struct bch_fs *c = trans->c;
-	struct journal *j = &c->journal;
-	struct btree_iter c_iter, b_iter;
-	struct bkey_cached *ck = NULL;
-	int ret;
-
-	bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
-			     BTREE_ITER_slots|
-			     BTREE_ITER_intent|
-			     BTREE_ITER_all_snapshots);
-	bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
-			     BTREE_ITER_cached|
-			     BTREE_ITER_intent);
-	b_iter.flags &= ~BTREE_ITER_with_key_cache;
-
-	ret = bch2_btree_iter_traverse(&c_iter);
-	if (ret)
-		goto out;
-
-	ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b;
-	if (!ck)
-		goto out;
-
-	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-		if (evict)
-			goto evict;
-		goto out;
-	}
-
-	BUG_ON(!ck->valid);
-
-	if (journal_seq && ck->journal.seq != journal_seq)
-		goto out;
-
-	trans->journal_res.seq = ck->journal.seq;
-
-	/*
-	 * If we're at the end of the journal, we really want to free up space
-	 * in the journal right away - we don't want to pin that old journal
-	 * sequence number with a new btree node write, we want to re-journal
-	 * the update
-	 */
-	if (ck->journal.seq == journal_last_seq(j))
-		commit_flags |= BCH_WATERMARK_reclaim;
-
-	if (ck->journal.seq != journal_last_seq(j) ||
-	    !test_bit(JOURNAL_space_low, &c->journal.flags))
-		commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
-
-	ret   = bch2_btree_iter_traverse(&b_iter) ?:
-		bch2_trans_update(trans, &b_iter, ck->k,
-				  BTREE_UPDATE_key_cache_reclaim|
-				  BTREE_UPDATE_internal_snapshot_node|
-				  BTREE_TRIGGER_norun) ?:
-		bch2_trans_commit(trans, NULL, NULL,
-				  BCH_TRANS_COMMIT_no_check_rw|
-				  BCH_TRANS_COMMIT_no_enospc|
-				  commit_flags);
-
-	bch2_fs_fatal_err_on(ret &&
-			     !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
-			     !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
-			     !bch2_journal_error(j), c,
-			     "flushing key cache: %s", bch2_err_str(ret));
-	if (ret)
-		goto out;
-
-	bch2_journal_pin_drop(j, &ck->journal);
-
-	struct btree_path *path = btree_iter_path(trans, &c_iter);
-	BUG_ON(!btree_node_locked(path, 0));
-
-	if (!evict) {
-		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
-			atomic_long_dec(&c->btree_key_cache.nr_dirty);
-		}
-	} else {
-		struct btree_path *path2;
-		unsigned i;
-evict:
-		trans_for_each_path(trans, path2, i)
-			if (path2 != path)
-				__bch2_btree_path_unlock(trans, path2);
-
-		bch2_btree_node_lock_write_nofail(trans, path, &ck->c);
-
-		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
-			atomic_long_dec(&c->btree_key_cache.nr_dirty);
-		}
-
-		mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
-		bkey_cached_evict(&c->btree_key_cache, ck);
-		bkey_cached_free_fast(&c->btree_key_cache, ck);
-	}
-out:
-	bch2_trans_iter_exit(trans, &b_iter);
-	bch2_trans_iter_exit(trans, &c_iter);
-	return ret;
-}
-
-int bch2_btree_key_cache_journal_flush(struct journal *j,
-				struct journal_entry_pin *pin, u64 seq)
-{
-	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-	struct bkey_cached *ck =
-		container_of(pin, struct bkey_cached, journal);
-	struct bkey_cached_key key;
-	struct btree_trans *trans = bch2_trans_get(c);
-	int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
-	int ret = 0;
-
-	btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
-	key = ck->key;
-
-	if (ck->journal.seq != seq ||
-	    !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-		six_unlock_read(&ck->c.lock);
-		goto unlock;
-	}
-
-	if (ck->seq != seq) {
-		bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
-					bch2_btree_key_cache_journal_flush);
-		six_unlock_read(&ck->c.lock);
-		goto unlock;
-	}
-	six_unlock_read(&ck->c.lock);
-
-	ret = lockrestart_do(trans,
-		btree_key_cache_flush_pos(trans, key, seq,
-				BCH_TRANS_COMMIT_journal_reclaim, false));
-unlock:
-	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
-
-	bch2_trans_put(trans);
-	return ret;
-}
-
-bool bch2_btree_insert_key_cached(struct btree_trans *trans,
-				  unsigned flags,
-				  struct btree_insert_entry *insert_entry)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b;
-	struct bkey_i *insert = insert_entry->k;
-	bool kick_reclaim = false;
-
-	BUG_ON(insert->k.u64s > ck->u64s);
-
-	bkey_copy(ck->k, insert);
-	ck->valid = true;
-
-	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-		EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
-		set_bit(BKEY_CACHED_DIRTY, &ck->flags);
-		atomic_long_inc(&c->btree_key_cache.nr_dirty);
-
-		if (bch2_nr_btree_keys_need_flush(c))
-			kick_reclaim = true;
-	}
-
-	/*
-	 * To minimize lock contention, we only add the journal pin here and
-	 * defer pin updates to the flush callback via ->seq. Be careful not to
-	 * update ->seq on nojournal commits because we don't want to update the
-	 * pin to a seq that doesn't include journal updates on disk. Otherwise
-	 * we risk losing the update after a crash.
-	 *
-	 * The only exception is if the pin is not active in the first place. We
-	 * have to add the pin because journal reclaim drives key cache
-	 * flushing. The flush callback will not proceed unless ->seq matches
-	 * the latest pin, so make sure it starts with a consistent value.
-	 */
-	if (!(insert_entry->flags & BTREE_UPDATE_nojournal) ||
-	    !journal_pin_active(&ck->journal)) {
-		ck->seq = trans->journal_res.seq;
-	}
-	bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
-			     &ck->journal, bch2_btree_key_cache_journal_flush);
-
-	if (kick_reclaim)
-		journal_reclaim_kick(&c->journal);
-	return true;
-}
-
-void bch2_btree_key_cache_drop(struct btree_trans *trans,
-			       struct btree_path *path)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_cached *ck = (void *) path->l[0].b;
-
-	BUG_ON(!ck->valid);
-
-	/*
-	 * We just did an update to the btree, bypassing the key cache: the key
-	 * cache key is now stale and must be dropped, even if dirty:
-	 */
-	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-		clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
-		atomic_long_dec(&c->btree_key_cache.nr_dirty);
-		bch2_journal_pin_drop(&c->journal, &ck->journal);
-	}
-
-	ck->valid = false;
-}
-
-static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
-					   struct shrink_control *sc)
-{
-	struct bch_fs *c = shrink->private_data;
-	struct btree_key_cache *bc = &c->btree_key_cache;
-	struct bucket_table *tbl;
-	struct bkey_cached *ck, *t;
-	size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
-	unsigned start, flags;
-	int srcu_idx;
-
-	mutex_lock(&bc->lock);
-	bc->requested_to_free += sc->nr_to_scan;
-
-	srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
-	flags = memalloc_nofs_save();
-
-	/*
-	 * Newest freed entries are at the end of the list - once we hit one
-	 * that's too new to be freed, we can bail out:
-	 */
-	list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
-		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
-						 ck->btree_trans_barrier_seq))
-			break;
-
-		list_del(&ck->list);
-		six_lock_exit(&ck->c.lock);
-		kmem_cache_free(bch2_key_cache, ck);
-		atomic_long_dec(&bc->nr_freed);
-		freed++;
-		bc->nr_freed_nonpcpu--;
-		bc->freed++;
-	}
-
-	list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
-		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
-						 ck->btree_trans_barrier_seq))
-			break;
-
-		list_del(&ck->list);
-		six_lock_exit(&ck->c.lock);
-		kmem_cache_free(bch2_key_cache, ck);
-		atomic_long_dec(&bc->nr_freed);
-		freed++;
-		bc->nr_freed_pcpu--;
-		bc->freed++;
-	}
-
-	rcu_read_lock();
-	tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
-	if (bc->shrink_iter >= tbl->size)
-		bc->shrink_iter = 0;
-	start = bc->shrink_iter;
-
-	do {
-		struct rhash_head *pos, *next;
-
-		pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));
-
-		while (!rht_is_a_nulls(pos)) {
-			next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
-			ck = container_of(pos, struct bkey_cached, hash);
-
-			if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-				bc->skipped_dirty++;
-				goto next;
-			} else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) {
-				clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
-				bc->skipped_accessed++;
-				goto next;
-			} else if (bkey_cached_lock_for_evict(ck)) {
-				bkey_cached_evict(bc, ck);
-				bkey_cached_free(bc, ck);
-				bc->moved_to_freelist++;
-			} else {
-				bc->skipped_lock_fail++;
-			}
-
-			scanned++;
-			if (scanned >= nr)
-				break;
-next:
-			pos = next;
-		}
-
-		bc->shrink_iter++;
-		if (bc->shrink_iter >= tbl->size)
-			bc->shrink_iter = 0;
-	} while (scanned < nr && bc->shrink_iter != start);
-
-	rcu_read_unlock();
-	memalloc_nofs_restore(flags);
-	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
-	mutex_unlock(&bc->lock);
-
-	return freed;
-}
-
-static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
-					    struct shrink_control *sc)
-{
-	struct bch_fs *c = shrink->private_data;
-	struct btree_key_cache *bc = &c->btree_key_cache;
-	long nr = atomic_long_read(&bc->nr_keys) -
-		atomic_long_read(&bc->nr_dirty);
-
-	return max(0L, nr);
-}
-
-void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
-{
-	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
-	struct bucket_table *tbl;
-	struct bkey_cached *ck, *n;
-	struct rhash_head *pos;
-	LIST_HEAD(items);
-	unsigned i;
-#ifdef __KERNEL__
-	int cpu;
-#endif
-
-	shrinker_free(bc->shrink);
-
-	mutex_lock(&bc->lock);
-
-	/*
-	 * The loop is needed to guard against racing with rehash:
-	 */
-	while (atomic_long_read(&bc->nr_keys)) {
-		rcu_read_lock();
-		tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
-		if (tbl)
-			for (i = 0; i < tbl->size; i++)
-				rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
-					bkey_cached_evict(bc, ck);
-					list_add(&ck->list, &items);
-				}
-		rcu_read_unlock();
-	}
-
-#ifdef __KERNEL__
-	if (bc->pcpu_freed) {
-		for_each_possible_cpu(cpu) {
-			struct btree_key_cache_freelist *f =
-				per_cpu_ptr(bc->pcpu_freed, cpu);
-
-			for (i = 0; i < f->nr; i++) {
-				ck = f->objs[i];
-				list_add(&ck->list, &items);
-			}
-		}
-	}
-#endif
-
-	BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
-	BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);
-
-	list_splice(&bc->freed_pcpu,	&items);
-	list_splice(&bc->freed_nonpcpu,	&items);
-
-	mutex_unlock(&bc->lock);
-
-	list_for_each_entry_safe(ck, n, &items, list) {
-		cond_resched();
-
-		list_del(&ck->list);
-		kfree(ck->k);
-		six_lock_exit(&ck->c.lock);
-		kmem_cache_free(bch2_key_cache, ck);
-	}
-
-	if (atomic_long_read(&bc->nr_dirty) &&
-	    !bch2_journal_error(&c->journal) &&
-	    test_bit(BCH_FS_was_rw, &c->flags))
-		panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
-		      atomic_long_read(&bc->nr_dirty));
-
-	if (atomic_long_read(&bc->nr_keys))
-		panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
-		      atomic_long_read(&bc->nr_keys));
-
-	if (bc->table_init_done)
-		rhashtable_destroy(&bc->table);
-
-	free_percpu(bc->pcpu_freed);
-}
-
-void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
-{
-	mutex_init(&c->lock);
-	INIT_LIST_HEAD(&c->freed_pcpu);
-	INIT_LIST_HEAD(&c->freed_nonpcpu);
-}
-
-int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
-{
-	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
-	struct shrinker *shrink;
-
-#ifdef __KERNEL__
-	bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
-	if (!bc->pcpu_freed)
-		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
-#endif
-
-	if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
-		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
-
-	bc->table_init_done = true;
-
-	shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
-	if (!shrink)
-		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
-	bc->shrink = shrink;
-	shrink->seeks		= 0;
-	shrink->count_objects	= bch2_btree_key_cache_count;
-	shrink->scan_objects	= bch2_btree_key_cache_scan;
-	shrink->private_data	= c;
-	shrinker_register(shrink);
-	return 0;
-}
-
-void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc)
-{
-	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
-
-	printbuf_tabstop_push(out, 24);
-	printbuf_tabstop_push(out, 12);
-
-	unsigned flags = memalloc_nofs_save();
-	mutex_lock(&bc->lock);
-	prt_printf(out, "keys:\t%lu\r\n",		atomic_long_read(&bc->nr_keys));
-	prt_printf(out, "dirty:\t%lu\r\n",		atomic_long_read(&bc->nr_dirty));
-	prt_printf(out, "freelist:\t%lu\r\n",		atomic_long_read(&bc->nr_freed));
-	prt_printf(out, "nonpcpu freelist:\t%zu\r\n",	bc->nr_freed_nonpcpu);
-	prt_printf(out, "pcpu freelist:\t%zu\r\n",	bc->nr_freed_pcpu);
-
-	prt_printf(out, "\nshrinker:\n");
-	prt_printf(out, "requested_to_free:\t%lu\r\n",	bc->requested_to_free);
-	prt_printf(out, "freed:\t%lu\r\n",		bc->freed);
-	prt_printf(out, "moved_to_freelist:\t%lu\r\n",	bc->moved_to_freelist);
-	prt_printf(out, "skipped_dirty:\t%lu\r\n",	bc->skipped_dirty);
-	prt_printf(out, "skipped_accessed:\t%lu\r\n",	bc->skipped_accessed);
-	prt_printf(out, "skipped_lock_fail:\t%lu\r\n",	bc->skipped_lock_fail);
-
-	prt_printf(out, "srcu seq:\t%lu\r\n",		get_state_synchronize_srcu(&c->btree_trans_barrier));
-
-	struct bkey_cached *ck;
-	unsigned iter = 0;
-	list_for_each_entry(ck, &bc->freed_nonpcpu, list) {
-		prt_printf(out, "freed_nonpcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
-		if (++iter > 10)
-			break;
-	}
-
-	iter = 0;
-	list_for_each_entry(ck, &bc->freed_pcpu, list) {
-		prt_printf(out, "freed_pcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
-		if (++iter > 10)
-			break;
-	}
-	mutex_unlock(&bc->lock);
-	memalloc_flags_restore(flags);
-}
-
-void bch2_btree_key_cache_exit(void)
-{
-	kmem_cache_destroy(bch2_key_cache);
-}
-
-int __init bch2_btree_key_cache_init(void)
-{
-	bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT);
-	if (!bch2_key_cache)
-		return -ENOMEM;
-
-	return 0;
-}
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
deleted file mode 100644
index e6b2cd0dd2c1..000000000000
--- a/fs/bcachefs/btree_key_cache.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_KEY_CACHE_H
-#define _BCACHEFS_BTREE_KEY_CACHE_H
-
-static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c)
-{
-	size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
-	size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
-	size_t max_dirty = 1024 + nr_keys  / 2;
-
-	return max_t(ssize_t, 0, nr_dirty - max_dirty);
-}
-
-static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
-{
-	size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
-	size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
-	size_t max_dirty = 4096 + (nr_keys * 3) / 4;
-
-	return nr_dirty > max_dirty;
-}
-
-int bch2_btree_key_cache_journal_flush(struct journal *,
-				struct journal_entry_pin *, u64);
-
-struct bkey_cached *
-bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
-
-int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *,
-				    unsigned);
-
-bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned,
-			struct btree_insert_entry *);
-void bch2_btree_key_cache_drop(struct btree_trans *,
-			       struct btree_path *);
-
-void bch2_fs_btree_key_cache_exit(struct btree_key_cache *);
-void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *);
-int bch2_fs_btree_key_cache_init(struct btree_key_cache *);
-
-void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *);
-
-void bch2_btree_key_cache_exit(void);
-int __init bch2_btree_key_cache_init(void);
-
-#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */
diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h
deleted file mode 100644
index 237e8bb3ac40..000000000000
--- a/fs/bcachefs/btree_key_cache_types.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
-#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
-
-struct btree_key_cache_freelist {
-	struct bkey_cached	*objs[16];
-	unsigned		nr;
-};
-
-struct btree_key_cache {
-	struct mutex		lock;
-	struct rhashtable	table;
-	bool			table_init_done;
-
-	struct list_head	freed_pcpu;
-	size_t			nr_freed_pcpu;
-	struct list_head	freed_nonpcpu;
-	size_t			nr_freed_nonpcpu;
-
-	struct shrinker		*shrink;
-	unsigned		shrink_iter;
-	struct btree_key_cache_freelist __percpu *pcpu_freed;
-
-	atomic_long_t		nr_freed;
-	atomic_long_t		nr_keys;
-	atomic_long_t		nr_dirty;
-
-	/* shrinker stats */
-	unsigned long		requested_to_free;
-	unsigned long		freed;
-	unsigned long		moved_to_freelist;
-	unsigned long		skipped_dirty;
-	unsigned long		skipped_accessed;
-	unsigned long		skipped_lock_fail;
-};
-
-struct bkey_cached_key {
-	u32			btree_id;
-	struct bpos		pos;
-} __packed __aligned(4);
-
-#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
deleted file mode 100644
index c3e9b0cc7bbd..000000000000
--- a/fs/bcachefs/btree_locking.c
+++ /dev/null
@@ -1,898 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_locking.h"
-#include "btree_types.h"
-
-static struct lock_class_key bch2_btree_node_lock_key;
-
-void bch2_btree_lock_init(struct btree_bkey_cached_common *b,
-			  enum six_lock_init_flags flags)
-{
-	__six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags);
-	lockdep_set_novalidate_class(&b->lock);
-}
-
-#ifdef CONFIG_LOCKDEP
-void bch2_assert_btree_nodes_not_locked(void)
-{
-#if 0
-	//Re-enable when lock_class_is_held() is merged:
-	BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
-#endif
-}
-#endif
-
-/* Btree node locking: */
-
-struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
-						  struct btree_path *skip,
-						  struct btree_bkey_cached_common *b,
-						  unsigned level)
-{
-	struct btree_path *path;
-	struct six_lock_count ret;
-	unsigned i;
-
-	memset(&ret, 0, sizeof(ret));
-
-	if (IS_ERR_OR_NULL(b))
-		return ret;
-
-	trans_for_each_path(trans, path, i)
-		if (path != skip && &path->l[level].b->c == b) {
-			int t = btree_node_locked_type(path, level);
-
-			if (t != BTREE_NODE_UNLOCKED)
-				ret.n[t]++;
-		}
-
-	return ret;
-}
-
-/* unlock */
-
-void bch2_btree_node_unlock_write(struct btree_trans *trans,
-			struct btree_path *path, struct btree *b)
-{
-	bch2_btree_node_unlock_write_inlined(trans, path, b);
-}
-
-/* lock */
-
-/*
- * @trans wants to lock @b with type @type
- */
-struct trans_waiting_for_lock {
-	struct btree_trans		*trans;
-	struct btree_bkey_cached_common	*node_want;
-	enum six_lock_type		lock_want;
-
-	/* for iterating over held locks :*/
-	u8				path_idx;
-	u8				level;
-	u64				lock_start_time;
-};
-
-struct lock_graph {
-	struct trans_waiting_for_lock	g[8];
-	unsigned			nr;
-};
-
-static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
-{
-	struct trans_waiting_for_lock *i;
-
-	prt_printf(out, "Found lock cycle (%u entries):\n", g->nr);
-
-	for (i = g->g; i < g->g + g->nr; i++) {
-		struct task_struct *task = READ_ONCE(i->trans->locking_wait.task);
-		if (!task)
-			continue;
-
-		bch2_btree_trans_to_text(out, i->trans);
-		bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1, GFP_NOWAIT);
-	}
-}
-
-static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
-{
-	struct trans_waiting_for_lock *i;
-
-	for (i = g->g; i != g->g + g->nr; i++) {
-		struct task_struct *task = i->trans->locking_wait.task;
-		if (i != g->g)
-			prt_str(out, "<- ");
-		prt_printf(out, "%u ", task ?task->pid : 0);
-	}
-	prt_newline(out);
-}
-
-static void lock_graph_up(struct lock_graph *g)
-{
-	closure_put(&g->g[--g->nr].trans->ref);
-}
-
-static noinline void lock_graph_pop_all(struct lock_graph *g)
-{
-	while (g->nr)
-		lock_graph_up(g);
-}
-
-static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
-{
-	g->g[g->nr++] = (struct trans_waiting_for_lock) {
-		.trans		= trans,
-		.node_want	= trans->locking,
-		.lock_want	= trans->locking_wait.lock_want,
-	};
-}
-
-static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
-{
-	closure_get(&trans->ref);
-	__lock_graph_down(g, trans);
-}
-
-static bool lock_graph_remove_non_waiters(struct lock_graph *g)
-{
-	struct trans_waiting_for_lock *i;
-
-	for (i = g->g + 1; i < g->g + g->nr; i++)
-		if (i->trans->locking != i->node_want ||
-		    i->trans->locking_wait.start_time != i[-1].lock_start_time) {
-			while (g->g + g->nr > i)
-				lock_graph_up(g);
-			return true;
-		}
-
-	return false;
-}
-
-static void trace_would_deadlock(struct lock_graph *g, struct btree_trans *trans)
-{
-	struct bch_fs *c = trans->c;
-
-	count_event(c, trans_restart_would_deadlock);
-
-	if (trace_trans_restart_would_deadlock_enabled()) {
-		struct printbuf buf = PRINTBUF;
-
-		buf.atomic++;
-		print_cycle(&buf, g);
-
-		trace_trans_restart_would_deadlock(trans, buf.buf);
-		printbuf_exit(&buf);
-	}
-}
-
-static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
-{
-	if (i == g->g) {
-		trace_would_deadlock(g, i->trans);
-		return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
-	} else {
-		i->trans->lock_must_abort = true;
-		wake_up_process(i->trans->locking_wait.task);
-		return 0;
-	}
-}
-
-static int btree_trans_abort_preference(struct btree_trans *trans)
-{
-	if (trans->lock_may_not_fail)
-		return 0;
-	if (trans->locking_wait.lock_want == SIX_LOCK_write)
-		return 1;
-	if (!trans->in_traverse_all)
-		return 2;
-	return 3;
-}
-
-static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
-{
-	struct trans_waiting_for_lock *i, *abort = NULL;
-	unsigned best = 0, pref;
-	int ret;
-
-	if (lock_graph_remove_non_waiters(g))
-		return 0;
-
-	/* Only checking, for debugfs: */
-	if (cycle) {
-		print_cycle(cycle, g);
-		ret = -1;
-		goto out;
-	}
-
-	for (i = g->g; i < g->g + g->nr; i++) {
-		pref = btree_trans_abort_preference(i->trans);
-		if (pref > best) {
-			abort = i;
-			best = pref;
-		}
-	}
-
-	if (unlikely(!best)) {
-		struct printbuf buf = PRINTBUF;
-
-		prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
-
-		for (i = g->g; i < g->g + g->nr; i++) {
-			struct btree_trans *trans = i->trans;
-
-			bch2_btree_trans_to_text(&buf, trans);
-
-			prt_printf(&buf, "backtrace:\n");
-			printbuf_indent_add(&buf, 2);
-			bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
-			printbuf_indent_sub(&buf, 2);
-			prt_newline(&buf);
-		}
-
-		bch2_print_string_as_lines(KERN_ERR, buf.buf);
-		printbuf_exit(&buf);
-		BUG();
-	}
-
-	ret = abort_lock(g, abort);
-out:
-	if (ret)
-		while (g->nr)
-			lock_graph_up(g);
-	return ret;
-}
-
-static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
-			      struct printbuf *cycle)
-{
-	struct btree_trans *orig_trans = g->g->trans;
-	struct trans_waiting_for_lock *i;
-
-	for (i = g->g; i < g->g + g->nr; i++)
-		if (i->trans == trans) {
-			closure_put(&trans->ref);
-			return break_cycle(g, cycle);
-		}
-
-	if (g->nr == ARRAY_SIZE(g->g)) {
-		closure_put(&trans->ref);
-
-		if (orig_trans->lock_may_not_fail)
-			return 0;
-
-		while (g->nr)
-			lock_graph_up(g);
-
-		if (cycle)
-			return 0;
-
-		trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_);
-		return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit);
-	}
-
-	__lock_graph_down(g, trans);
-	return 0;
-}
-
-static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2)
-{
-	return t1 + t2 > 1;
-}
-
-int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
-{
-	struct lock_graph g;
-	struct trans_waiting_for_lock *top;
-	struct btree_bkey_cached_common *b;
-	btree_path_idx_t path_idx;
-	int ret = 0;
-
-	g.nr = 0;
-
-	if (trans->lock_must_abort) {
-		if (cycle)
-			return -1;
-
-		trace_would_deadlock(&g, trans);
-		return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
-	}
-
-	lock_graph_down(&g, trans);
-
-	/* trans->paths is rcu protected vs. freeing */
-	rcu_read_lock();
-	if (cycle)
-		cycle->atomic++;
-next:
-	if (!g.nr)
-		goto out;
-
-	top = &g.g[g.nr - 1];
-
-	struct btree_path *paths = rcu_dereference(top->trans->paths);
-	if (!paths)
-		goto up;
-
-	unsigned long *paths_allocated = trans_paths_allocated(paths);
-
-	trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths),
-				     path_idx, top->path_idx) {
-		struct btree_path *path = paths + path_idx;
-		if (!path->nodes_locked)
-			continue;
-
-		if (path_idx != top->path_idx) {
-			top->path_idx		= path_idx;
-			top->level		= 0;
-			top->lock_start_time	= 0;
-		}
-
-		for (;
-		     top->level < BTREE_MAX_DEPTH;
-		     top->level++, top->lock_start_time = 0) {
-			int lock_held = btree_node_locked_type(path, top->level);
-
-			if (lock_held == BTREE_NODE_UNLOCKED)
-				continue;
-
-			b = &READ_ONCE(path->l[top->level].b)->c;
-
-			if (IS_ERR_OR_NULL(b)) {
-				/*
-				 * If we get here, it means we raced with the
-				 * other thread updating its btree_path
-				 * structures - which means it can't be blocked
-				 * waiting on a lock:
-				 */
-				if (!lock_graph_remove_non_waiters(&g)) {
-					/*
-					 * If lock_graph_remove_non_waiters()
-					 * didn't do anything, it must be
-					 * because we're being called by debugfs
-					 * checking for lock cycles, which
-					 * invokes us on btree_transactions that
-					 * aren't actually waiting on anything.
-					 * Just bail out:
-					 */
-					lock_graph_pop_all(&g);
-				}
-
-				goto next;
-			}
-
-			if (list_empty_careful(&b->lock.wait_list))
-				continue;
-
-			raw_spin_lock(&b->lock.wait_lock);
-			list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) {
-				BUG_ON(b != trans->locking);
-
-				if (top->lock_start_time &&
-				    time_after_eq64(top->lock_start_time, trans->locking_wait.start_time))
-					continue;
-
-				top->lock_start_time = trans->locking_wait.start_time;
-
-				/* Don't check for self deadlock: */
-				if (trans == top->trans ||
-				    !lock_type_conflicts(lock_held, trans->locking_wait.lock_want))
-					continue;
-
-				closure_get(&trans->ref);
-				raw_spin_unlock(&b->lock.wait_lock);
-
-				ret = lock_graph_descend(&g, trans, cycle);
-				if (ret)
-					goto out;
-				goto next;
-
-			}
-			raw_spin_unlock(&b->lock.wait_lock);
-		}
-	}
-up:
-	if (g.nr > 1 && cycle)
-		print_chain(cycle, &g);
-	lock_graph_up(&g);
-	goto next;
-out:
-	if (cycle)
-		--cycle->atomic;
-	rcu_read_unlock();
-	return ret;
-}
-
-int bch2_six_check_for_deadlock(struct six_lock *lock, void *p)
-{
-	struct btree_trans *trans = p;
-
-	return bch2_check_for_deadlock(trans, NULL);
-}
-
-int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path,
-				 struct btree_bkey_cached_common *b,
-				 bool lock_may_not_fail)
-{
-	int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read];
-	int ret;
-
-	/*
-	 * Must drop our read locks before calling six_lock_write() -
-	 * six_unlock() won't do wakeups until the reader count
-	 * goes to 0, and it's safe because we have the node intent
-	 * locked:
-	 */
-	six_lock_readers_add(&b->lock, -readers);
-	ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write,
-				       lock_may_not_fail, _RET_IP_);
-	six_lock_readers_add(&b->lock, readers);
-
-	if (ret)
-		mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_INTENT_LOCKED);
-
-	return ret;
-}
-
-void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
-				       struct btree_path *path,
-				       struct btree_bkey_cached_common *b)
-{
-	int ret = __btree_node_lock_write(trans, path, b, true);
-	BUG_ON(ret);
-}
-
-/* relock */
-
-static inline bool btree_path_get_locks(struct btree_trans *trans,
-					struct btree_path *path,
-					bool upgrade,
-					struct get_locks_fail *f)
-{
-	unsigned l = path->level;
-	int fail_idx = -1;
-
-	do {
-		if (!btree_path_node(path, l))
-			break;
-
-		if (!(upgrade
-		      ? bch2_btree_node_upgrade(trans, path, l)
-		      : bch2_btree_node_relock(trans, path, l))) {
-			fail_idx	= l;
-
-			if (f) {
-				f->l	= l;
-				f->b	= path->l[l].b;
-			}
-		}
-
-		l++;
-	} while (l < path->locks_want);
-
-	/*
-	 * When we fail to get a lock, we have to ensure that any child nodes
-	 * can't be relocked so bch2_btree_path_traverse has to walk back up to
-	 * the node that we failed to relock:
-	 */
-	if (fail_idx >= 0) {
-		__bch2_btree_path_unlock(trans, path);
-		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
-
-		do {
-			path->l[fail_idx].b = upgrade
-				? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
-				: ERR_PTR(-BCH_ERR_no_btree_node_relock);
-			--fail_idx;
-		} while (fail_idx >= 0);
-	}
-
-	if (path->uptodate == BTREE_ITER_NEED_RELOCK)
-		path->uptodate = BTREE_ITER_UPTODATE;
-
-	return path->uptodate < BTREE_ITER_NEED_RELOCK;
-}
-
-bool __bch2_btree_node_relock(struct btree_trans *trans,
-			      struct btree_path *path, unsigned level,
-			      bool trace)
-{
-	struct btree *b = btree_path_node(path, level);
-	int want = __btree_lock_want(path, level);
-
-	if (race_fault())
-		goto fail;
-
-	if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
-	    (btree_node_lock_seq_matches(path, b, level) &&
-	     btree_node_lock_increment(trans, &b->c, level, want))) {
-		mark_btree_node_locked(trans, path, level, want);
-		return true;
-	}
-fail:
-	if (trace && !trans->notrace_relock_fail)
-		trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level);
-	return false;
-}
-
-/* upgrade */
-
-bool bch2_btree_node_upgrade(struct btree_trans *trans,
-			     struct btree_path *path, unsigned level)
-{
-	struct btree *b = path->l[level].b;
-	struct six_lock_count count = bch2_btree_node_lock_counts(trans, path, &b->c, level);
-
-	if (!is_btree_node(path, level))
-		return false;
-
-	switch (btree_lock_want(path, level)) {
-	case BTREE_NODE_UNLOCKED:
-		BUG_ON(btree_node_locked(path, level));
-		return true;
-	case BTREE_NODE_READ_LOCKED:
-		BUG_ON(btree_node_intent_locked(path, level));
-		return bch2_btree_node_relock(trans, path, level);
-	case BTREE_NODE_INTENT_LOCKED:
-		break;
-	case BTREE_NODE_WRITE_LOCKED:
-		BUG();
-	}
-
-	if (btree_node_intent_locked(path, level))
-		return true;
-
-	if (race_fault())
-		return false;
-
-	if (btree_node_locked(path, level)) {
-		bool ret;
-
-		six_lock_readers_add(&b->c.lock, -count.n[SIX_LOCK_read]);
-		ret = six_lock_tryupgrade(&b->c.lock);
-		six_lock_readers_add(&b->c.lock, count.n[SIX_LOCK_read]);
-
-		if (ret)
-			goto success;
-	} else {
-		if (six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
-			goto success;
-	}
-
-	/*
-	 * Do we already have an intent lock via another path? If so, just bump
-	 * lock count:
-	 */
-	if (btree_node_lock_seq_matches(path, b, level) &&
-	    btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) {
-		btree_node_unlock(trans, path, level);
-		goto success;
-	}
-
-	trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level);
-	return false;
-success:
-	mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
-	return true;
-}
-
-/* Btree path locking: */
-
-/*
- * Only for btree_cache.c - only relocks intent locks
- */
-int bch2_btree_path_relock_intent(struct btree_trans *trans,
-				  struct btree_path *path)
-{
-	unsigned l;
-
-	for (l = path->level;
-	     l < path->locks_want && btree_path_node(path, l);
-	     l++) {
-		if (!bch2_btree_node_relock(trans, path, l)) {
-			__bch2_btree_path_unlock(trans, path);
-			btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
-			trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path);
-			return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent);
-		}
-	}
-
-	return 0;
-}
-
-__flatten
-bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path)
-{
-	struct get_locks_fail f;
-
-	bool ret = btree_path_get_locks(trans, path, false, &f);
-	bch2_trans_verify_locks(trans);
-	return ret;
-}
-
-int __bch2_btree_path_relock(struct btree_trans *trans,
-			struct btree_path *path, unsigned long trace_ip)
-{
-	if (!bch2_btree_path_relock_norestart(trans, path)) {
-		trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
-		return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
-	}
-
-	return 0;
-}
-
-bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
-			       struct btree_path *path,
-			       unsigned new_locks_want,
-			       struct get_locks_fail *f)
-{
-	EBUG_ON(path->locks_want >= new_locks_want);
-
-	path->locks_want = new_locks_want;
-
-	bool ret = btree_path_get_locks(trans, path, true, f);
-	bch2_trans_verify_locks(trans);
-	return ret;
-}
-
-bool __bch2_btree_path_upgrade(struct btree_trans *trans,
-			       struct btree_path *path,
-			       unsigned new_locks_want,
-			       struct get_locks_fail *f)
-{
-	bool ret = bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f);
-	if (ret)
-		goto out;
-
-	/*
-	 * XXX: this is ugly - we'd prefer to not be mucking with other
-	 * iterators in the btree_trans here.
-	 *
-	 * On failure to upgrade the iterator, setting iter->locks_want and
-	 * calling get_locks() is sufficient to make bch2_btree_path_traverse()
-	 * get the locks we want on transaction restart.
-	 *
-	 * But if this iterator was a clone, on transaction restart what we did
-	 * to this iterator isn't going to be preserved.
-	 *
-	 * Possibly we could add an iterator field for the parent iterator when
-	 * an iterator is a copy - for now, we'll just upgrade any other
-	 * iterators with the same btree id.
-	 *
-	 * The code below used to be needed to ensure ancestor nodes get locked
-	 * before interior nodes - now that's handled by
-	 * bch2_btree_path_traverse_all().
-	 */
-	if (!path->cached && !trans->in_traverse_all) {
-		struct btree_path *linked;
-		unsigned i;
-
-		trans_for_each_path(trans, linked, i)
-			if (linked != path &&
-			    linked->cached == path->cached &&
-			    linked->btree_id == path->btree_id &&
-			    linked->locks_want < new_locks_want) {
-				linked->locks_want = new_locks_want;
-				btree_path_get_locks(trans, linked, true, NULL);
-			}
-	}
-out:
-	bch2_trans_verify_locks(trans);
-	return ret;
-}
-
-void __bch2_btree_path_downgrade(struct btree_trans *trans,
-				 struct btree_path *path,
-				 unsigned new_locks_want)
-{
-	unsigned l, old_locks_want = path->locks_want;
-
-	if (trans->restarted)
-		return;
-
-	EBUG_ON(path->locks_want < new_locks_want);
-
-	path->locks_want = new_locks_want;
-
-	while (path->nodes_locked &&
-	       (l = btree_path_highest_level_locked(path)) >= path->locks_want) {
-		if (l > path->level) {
-			btree_node_unlock(trans, path, l);
-		} else {
-			if (btree_node_intent_locked(path, l)) {
-				six_lock_downgrade(&path->l[l].b->c.lock);
-				mark_btree_node_locked_noreset(path, l, BTREE_NODE_READ_LOCKED);
-			}
-			break;
-		}
-	}
-
-	bch2_btree_path_verify_locks(path);
-
-	trace_path_downgrade(trans, _RET_IP_, path, old_locks_want);
-}
-
-/* Btree transaction locking: */
-
-void bch2_trans_downgrade(struct btree_trans *trans)
-{
-	struct btree_path *path;
-	unsigned i;
-
-	if (trans->restarted)
-		return;
-
-	trans_for_each_path(trans, path, i)
-		if (path->ref)
-			bch2_btree_path_downgrade(trans, path);
-}
-
-static inline void __bch2_trans_unlock(struct btree_trans *trans)
-{
-	struct btree_path *path;
-	unsigned i;
-
-	trans_for_each_path(trans, path, i)
-		__bch2_btree_path_unlock(trans, path);
-}
-
-static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path,
-						  struct get_locks_fail *f, bool trace)
-{
-	if (!trace)
-		goto out;
-
-	if (trace_trans_restart_relock_enabled()) {
-		struct printbuf buf = PRINTBUF;
-
-		bch2_bpos_to_text(&buf, path->pos);
-		prt_printf(&buf, " l=%u seq=%u node seq=", f->l, path->l[f->l].lock_seq);
-		if (IS_ERR_OR_NULL(f->b)) {
-			prt_str(&buf, bch2_err_str(PTR_ERR(f->b)));
-		} else {
-			prt_printf(&buf, "%u", f->b->c.lock.seq);
-
-			struct six_lock_count c =
-				bch2_btree_node_lock_counts(trans, NULL, &f->b->c, f->l);
-			prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
-
-			c = six_lock_counts(&f->b->c.lock);
-			prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
-		}
-
-		trace_trans_restart_relock(trans, _RET_IP_, buf.buf);
-		printbuf_exit(&buf);
-	}
-
-	count_event(trans->c, trans_restart_relock);
-out:
-	__bch2_trans_unlock(trans);
-	bch2_trans_verify_locks(trans);
-	return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
-}
-
-static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace)
-{
-	bch2_trans_verify_locks(trans);
-
-	if (unlikely(trans->restarted))
-		return -((int) trans->restarted);
-	if (unlikely(trans->locked))
-		goto out;
-
-	struct btree_path *path;
-	unsigned i;
-
-	trans_for_each_path(trans, path, i) {
-		struct get_locks_fail f;
-
-		if (path->should_be_locked &&
-		    !btree_path_get_locks(trans, path, false, &f))
-			return bch2_trans_relock_fail(trans, path, &f, trace);
-	}
-
-	trans->locked = true;
-out:
-	bch2_trans_verify_locks(trans);
-	return 0;
-}
-
-int bch2_trans_relock(struct btree_trans *trans)
-{
-	return __bch2_trans_relock(trans, true);
-}
-
-int bch2_trans_relock_notrace(struct btree_trans *trans)
-{
-	return __bch2_trans_relock(trans, false);
-}
-
-void bch2_trans_unlock_noassert(struct btree_trans *trans)
-{
-	__bch2_trans_unlock(trans);
-
-	trans->locked = false;
-	trans->last_unlock_ip = _RET_IP_;
-}
-
-void bch2_trans_unlock(struct btree_trans *trans)
-{
-	__bch2_trans_unlock(trans);
-
-	trans->locked = false;
-	trans->last_unlock_ip = _RET_IP_;
-}
-
-void bch2_trans_unlock_long(struct btree_trans *trans)
-{
-	bch2_trans_unlock(trans);
-	bch2_trans_srcu_unlock(trans);
-}
-
-int __bch2_trans_mutex_lock(struct btree_trans *trans,
-			    struct mutex *lock)
-{
-	int ret = drop_locks_do(trans, (mutex_lock(lock), 0));
-
-	if (ret)
-		mutex_unlock(lock);
-	return ret;
-}
-
-/* Debug */
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-void bch2_btree_path_verify_locks(struct btree_path *path)
-{
-	/*
-	 * A path may be uptodate and yet have nothing locked if and only if
-	 * there is no node at path->level, which generally means we were
-	 * iterating over all nodes and got to the end of the btree
-	 */
-	BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
-	       btree_path_node(path, path->level) &&
-	       !path->nodes_locked);
-
-	if (!path->nodes_locked)
-		return;
-
-	for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
-		int want = btree_lock_want(path, l);
-		int have = btree_node_locked_type(path, l);
-
-		BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED);
-
-		BUG_ON(is_btree_node(path, l) &&
-		       (want == BTREE_NODE_UNLOCKED ||
-			have != BTREE_NODE_WRITE_LOCKED) &&
-		       want != have);
-	}
-}
-
-static bool bch2_trans_locked(struct btree_trans *trans)
-{
-	struct btree_path *path;
-	unsigned i;
-
-	trans_for_each_path(trans, path, i)
-		if (path->nodes_locked)
-			return true;
-	return false;
-}
-
-void bch2_trans_verify_locks(struct btree_trans *trans)
-{
-	if (!trans->locked) {
-		BUG_ON(bch2_trans_locked(trans));
-		return;
-	}
-
-	struct btree_path *path;
-	unsigned i;
-
-	trans_for_each_path(trans, path, i)
-		bch2_btree_path_verify_locks(path);
-}
-
-#endif
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
deleted file mode 100644
index 7f41545b9147..000000000000
--- a/fs/bcachefs/btree_locking.h
+++ /dev/null
@@ -1,424 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_LOCKING_H
-#define _BCACHEFS_BTREE_LOCKING_H
-
-/*
- * Only for internal btree use:
- *
- * The btree iterator tracks what locks it wants to take, and what locks it
- * currently has - here we have wrappers for locking/unlocking btree nodes and
- * updating the iterator state
- */
-
-#include "btree_iter.h"
-#include "six.h"
-
-void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags);
-
-#ifdef CONFIG_LOCKDEP
-void bch2_assert_btree_nodes_not_locked(void);
-#else
-static inline void bch2_assert_btree_nodes_not_locked(void) {}
-#endif
-
-void bch2_trans_unlock_noassert(struct btree_trans *);
-
-static inline bool is_btree_node(struct btree_path *path, unsigned l)
-{
-	return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b);
-}
-
-static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans)
-{
-	return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats)
-		? &trans->c->btree_transaction_stats[trans->fn_idx]
-		: NULL;
-}
-
-/* matches six lock types */
-enum btree_node_locked_type {
-	BTREE_NODE_UNLOCKED		= -1,
-	BTREE_NODE_READ_LOCKED		= SIX_LOCK_read,
-	BTREE_NODE_INTENT_LOCKED	= SIX_LOCK_intent,
-	BTREE_NODE_WRITE_LOCKED		= SIX_LOCK_write,
-};
-
-static inline int btree_node_locked_type(struct btree_path *path,
-					 unsigned level)
-{
-	return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3);
-}
-
-static inline bool btree_node_write_locked(struct btree_path *path, unsigned l)
-{
-	return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED;
-}
-
-static inline bool btree_node_intent_locked(struct btree_path *path, unsigned l)
-{
-	return btree_node_locked_type(path, l) == BTREE_NODE_INTENT_LOCKED;
-}
-
-static inline bool btree_node_read_locked(struct btree_path *path, unsigned l)
-{
-	return btree_node_locked_type(path, l) == BTREE_NODE_READ_LOCKED;
-}
-
-static inline bool btree_node_locked(struct btree_path *path, unsigned level)
-{
-	return btree_node_locked_type(path, level) != BTREE_NODE_UNLOCKED;
-}
-
-static inline void mark_btree_node_locked_noreset(struct btree_path *path,
-						  unsigned level,
-						  enum btree_node_locked_type type)
-{
-	/* relying on this to avoid a branch */
-	BUILD_BUG_ON(SIX_LOCK_read   != 0);
-	BUILD_BUG_ON(SIX_LOCK_intent != 1);
-
-	path->nodes_locked &= ~(3U << (level << 1));
-	path->nodes_locked |= (type + 1) << (level << 1);
-}
-
-static inline void mark_btree_node_unlocked(struct btree_path *path,
-					    unsigned level)
-{
-	EBUG_ON(btree_node_write_locked(path, level));
-	mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED);
-}
-
-static inline void mark_btree_node_locked(struct btree_trans *trans,
-					  struct btree_path *path,
-					  unsigned level,
-					  enum btree_node_locked_type type)
-{
-	mark_btree_node_locked_noreset(path, level, (enum btree_node_locked_type) type);
-#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
-	path->l[level].lock_taken_time = local_clock();
-#endif
-}
-
-static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level)
-{
-	return level < path->locks_want
-		? SIX_LOCK_intent
-		: SIX_LOCK_read;
-}
-
-static inline enum btree_node_locked_type
-btree_lock_want(struct btree_path *path, int level)
-{
-	if (level < path->level)
-		return BTREE_NODE_UNLOCKED;
-	if (level < path->locks_want)
-		return BTREE_NODE_INTENT_LOCKED;
-	if (level == path->level)
-		return BTREE_NODE_READ_LOCKED;
-	return BTREE_NODE_UNLOCKED;
-}
-
-static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
-					      struct btree_path *path, unsigned level)
-{
-#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
-	__bch2_time_stats_update(&btree_trans_stats(trans)->lock_hold_times,
-				 path->l[level].lock_taken_time,
-				 local_clock());
-#endif
-}
-
-/* unlock: */
-
-static inline void btree_node_unlock(struct btree_trans *trans,
-				     struct btree_path *path, unsigned level)
-{
-	int lock_type = btree_node_locked_type(path, level);
-
-	EBUG_ON(level >= BTREE_MAX_DEPTH);
-
-	if (lock_type != BTREE_NODE_UNLOCKED) {
-		six_unlock_type(&path->l[level].b->c.lock, lock_type);
-		btree_trans_lock_hold_time_update(trans, path, level);
-	}
-	mark_btree_node_unlocked(path, level);
-}
-
-static inline int btree_path_lowest_level_locked(struct btree_path *path)
-{
-	return __ffs(path->nodes_locked) >> 1;
-}
-
-static inline int btree_path_highest_level_locked(struct btree_path *path)
-{
-	return __fls(path->nodes_locked) >> 1;
-}
-
-static inline void __bch2_btree_path_unlock(struct btree_trans *trans,
-					    struct btree_path *path)
-{
-	btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK);
-
-	while (path->nodes_locked)
-		btree_node_unlock(trans, path, btree_path_lowest_level_locked(path));
-}
-
-/*
- * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
- * succeed:
- */
-static inline void
-bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path,
-				     struct btree *b)
-{
-	struct btree_path *linked;
-	unsigned i;
-
-	EBUG_ON(path->l[b->c.level].b != b);
-	EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock));
-	EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write);
-
-	mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
-
-	trans_for_each_path_with_node(trans, b, linked, i)
-		linked->l[b->c.level].lock_seq++;
-
-	six_unlock_write(&b->c.lock);
-}
-
-void bch2_btree_node_unlock_write(struct btree_trans *,
-			struct btree_path *, struct btree *);
-
-int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
-
-/* lock: */
-
-static inline int __btree_node_lock_nopath(struct btree_trans *trans,
-					 struct btree_bkey_cached_common *b,
-					 enum six_lock_type type,
-					 bool lock_may_not_fail,
-					 unsigned long ip)
-{
-	int ret;
-
-	trans->lock_may_not_fail = lock_may_not_fail;
-	trans->lock_must_abort	= false;
-	trans->locking		= b;
-
-	ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
-				 bch2_six_check_for_deadlock, trans, ip);
-	WRITE_ONCE(trans->locking, NULL);
-	WRITE_ONCE(trans->locking_wait.start_time, 0);
-	return ret;
-}
-
-static inline int __must_check
-btree_node_lock_nopath(struct btree_trans *trans,
-		       struct btree_bkey_cached_common *b,
-		       enum six_lock_type type,
-		       unsigned long ip)
-{
-	return __btree_node_lock_nopath(trans, b, type, false, ip);
-}
-
-static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans,
-					 struct btree_bkey_cached_common *b,
-					 enum six_lock_type type)
-{
-	int ret = __btree_node_lock_nopath(trans, b, type, true, _THIS_IP_);
-
-	BUG_ON(ret);
-}
-
-/*
- * Lock a btree node if we already have it locked on one of our linked
- * iterators:
- */
-static inline bool btree_node_lock_increment(struct btree_trans *trans,
-					     struct btree_bkey_cached_common *b,
-					     unsigned level,
-					     enum btree_node_locked_type want)
-{
-	struct btree_path *path;
-	unsigned i;
-
-	trans_for_each_path(trans, path, i)
-		if (&path->l[level].b->c == b &&
-		    btree_node_locked_type(path, level) >= want) {
-			six_lock_increment(&b->lock, (enum six_lock_type) want);
-			return true;
-		}
-
-	return false;
-}
-
-static inline int btree_node_lock(struct btree_trans *trans,
-			struct btree_path *path,
-			struct btree_bkey_cached_common *b,
-			unsigned level,
-			enum six_lock_type type,
-			unsigned long ip)
-{
-	int ret = 0;
-
-	EBUG_ON(level >= BTREE_MAX_DEPTH);
-
-	if (likely(six_trylock_type(&b->lock, type)) ||
-	    btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) ||
-	    !(ret = btree_node_lock_nopath(trans, b, type, btree_path_ip_allocated(path)))) {
-#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
-		path->l[b->level].lock_taken_time = local_clock();
-#endif
-	}
-
-	return ret;
-}
-
-int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_path *,
-				 struct btree_bkey_cached_common *b, bool);
-
-static inline int __btree_node_lock_write(struct btree_trans *trans,
-					  struct btree_path *path,
-					  struct btree_bkey_cached_common *b,
-					  bool lock_may_not_fail)
-{
-	EBUG_ON(&path->l[b->level].b->c != b);
-	EBUG_ON(path->l[b->level].lock_seq != six_lock_seq(&b->lock));
-	EBUG_ON(!btree_node_intent_locked(path, b->level));
-
-	/*
-	 * six locks are unfair, and read locks block while a thread wants a
-	 * write lock: thus, we need to tell the cycle detector we have a write
-	 * lock _before_ taking the lock:
-	 */
-	mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_WRITE_LOCKED);
-
-	return likely(six_trylock_write(&b->lock))
-		? 0
-		: __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail);
-}
-
-static inline int __must_check
-bch2_btree_node_lock_write(struct btree_trans *trans,
-			   struct btree_path *path,
-			   struct btree_bkey_cached_common *b)
-{
-	return __btree_node_lock_write(trans, path, b, false);
-}
-
-void bch2_btree_node_lock_write_nofail(struct btree_trans *,
-				       struct btree_path *,
-				       struct btree_bkey_cached_common *);
-
-/* relock: */
-
-bool bch2_btree_path_relock_norestart(struct btree_trans *, struct btree_path *);
-int __bch2_btree_path_relock(struct btree_trans *,
-			     struct btree_path *, unsigned long);
-
-static inline int bch2_btree_path_relock(struct btree_trans *trans,
-				struct btree_path *path, unsigned long trace_ip)
-{
-	return btree_node_locked(path, path->level)
-		? 0
-		: __bch2_btree_path_relock(trans, path, trace_ip);
-}
-
-bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned, bool trace);
-
-static inline bool bch2_btree_node_relock(struct btree_trans *trans,
-					  struct btree_path *path, unsigned level)
-{
-	EBUG_ON(btree_node_locked(path, level) &&
-		!btree_node_write_locked(path, level) &&
-		btree_node_locked_type(path, level) != __btree_lock_want(path, level));
-
-	return likely(btree_node_locked(path, level)) ||
-		(!IS_ERR_OR_NULL(path->l[level].b) &&
-		 __bch2_btree_node_relock(trans, path, level, true));
-}
-
-static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
-						  struct btree_path *path, unsigned level)
-{
-	EBUG_ON(btree_node_locked(path, level) &&
-		!btree_node_write_locked(path, level) &&
-		btree_node_locked_type(path, level) != __btree_lock_want(path, level));
-
-	return likely(btree_node_locked(path, level)) ||
-		(!IS_ERR_OR_NULL(path->l[level].b) &&
-		 __bch2_btree_node_relock(trans, path, level, false));
-}
-
-/* upgrade */
-
-bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
-			       struct btree_path *, unsigned,
-			       struct get_locks_fail *);
-
-bool __bch2_btree_path_upgrade(struct btree_trans *,
-			       struct btree_path *, unsigned,
-			       struct get_locks_fail *);
-
-static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
-					  struct btree_path *path,
-					  unsigned new_locks_want)
-{
-	struct get_locks_fail f = {};
-	unsigned old_locks_want = path->locks_want;
-
-	new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
-
-	if (path->locks_want < new_locks_want
-	    ? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f)
-	    : path->nodes_locked)
-		return 0;
-
-	trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path,
-			old_locks_want, new_locks_want, &f);
-	return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade);
-}
-
-/* misc: */
-
-static inline void btree_path_set_should_be_locked(struct btree_path *path)
-{
-	EBUG_ON(!btree_node_locked(path, path->level));
-	EBUG_ON(path->uptodate);
-
-	path->should_be_locked = true;
-}
-
-static inline void __btree_path_set_level_up(struct btree_trans *trans,
-				      struct btree_path *path,
-				      unsigned l)
-{
-	btree_node_unlock(trans, path, l);
-	path->l[l].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
-}
-
-static inline void btree_path_set_level_up(struct btree_trans *trans,
-				    struct btree_path *path)
-{
-	__btree_path_set_level_up(trans, path, path->level++);
-	btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
-}
-
-/* debug */
-
-struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *,
-				struct btree_path *,
-				struct btree_bkey_cached_common *b,
-				unsigned);
-
-int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *);
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_btree_path_verify_locks(struct btree_path *);
-void bch2_trans_verify_locks(struct btree_trans *);
-#else
-static inline void bch2_btree_path_verify_locks(struct btree_path *path) {}
-static inline void bch2_trans_verify_locks(struct btree_trans *trans) {}
-#endif
-
-#endif /* _BCACHEFS_BTREE_LOCKING_H */
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
deleted file mode 100644
index 45cb8149d374..000000000000
--- a/fs/bcachefs/btree_node_scan.c
+++ /dev/null
@@ -1,524 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_journal_iter.h"
-#include "btree_node_scan.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
-#include "error.h"
-#include "journal_io.h"
-#include "recovery_passes.h"
-
-#include <linux/kthread.h>
-#include <linux/sort.h>
-
-struct find_btree_nodes_worker {
-	struct closure		*cl;
-	struct find_btree_nodes	*f;
-	struct bch_dev		*ca;
-};
-
-static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
-{
-	prt_printf(out, "%s l=%u seq=%u cookie=%llx ", bch2_btree_id_str(n->btree_id), n->level, n->seq, n->cookie);
-	bch2_bpos_to_text(out, n->min_key);
-	prt_str(out, "-");
-	bch2_bpos_to_text(out, n->max_key);
-
-	if (n->range_updated)
-		prt_str(out, " range updated");
-	if (n->overwritten)
-		prt_str(out, " overwritten");
-
-	for (unsigned i = 0; i < n->nr_ptrs; i++) {
-		prt_char(out, ' ');
-		bch2_extent_ptr_to_text(out, c, n->ptrs + i);
-	}
-}
-
-static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
-{
-	printbuf_indent_add(out, 2);
-	darray_for_each(nodes, i) {
-		found_btree_node_to_text(out, c, i);
-		prt_newline(out);
-	}
-	printbuf_indent_sub(out, 2);
-}
-
-static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
-{
-	struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);
-
-	set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs);
-	bp->k.p			= f->max_key;
-	bp->v.seq		= cpu_to_le64(f->cookie);
-	bp->v.sectors_written	= 0;
-	bp->v.flags		= 0;
-	bp->v.sectors_written	= cpu_to_le16(f->sectors_written);
-	bp->v.min_key		= f->min_key;
-	SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
-	memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
-}
-
-static bool found_btree_node_is_readable(struct btree_trans *trans,
-					 struct found_btree_node *f)
-{
-	struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } k;
-
-	found_btree_node_to_key(&k.k, f);
-
-	struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false);
-	bool ret = !IS_ERR_OR_NULL(b);
-	if (ret) {
-		f->sectors_written = b->written;
-		six_unlock_read(&b->c.lock);
-	}
-
-	/*
-	 * We might update this node's range; if that happens, we need the node
-	 * to be re-read so the read path can trim keys that are no longer in
-	 * this node
-	 */
-	if (b != btree_node_root(trans->c, b))
-		bch2_btree_node_evict(trans, &k.k);
-	return ret;
-}
-
-static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
-{
-	const struct found_btree_node *l = _l;
-	const struct found_btree_node *r = _r;
-
-	return  cmp_int(l->btree_id,	r->btree_id) ?:
-		cmp_int(l->level,	r->level) ?:
-		cmp_int(l->cookie,	r->cookie);
-}
-
-/*
- * Given two found btree nodes, if their sequence numbers are equal, take the
- * one that's readable:
- */
-static int found_btree_node_cmp_time(const struct found_btree_node *l,
-				     const struct found_btree_node *r)
-{
-	return cmp_int(l->seq, r->seq);
-}
-
-static int found_btree_node_cmp_pos(const void *_l, const void *_r)
-{
-	const struct found_btree_node *l = _l;
-	const struct found_btree_node *r = _r;
-
-	return  cmp_int(l->btree_id,	r->btree_id) ?:
-	       -cmp_int(l->level,	r->level) ?:
-		bpos_cmp(l->min_key,	r->min_key) ?:
-	       -found_btree_node_cmp_time(l, r);
-}
-
-static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
-				struct bio *bio, struct btree_node *bn, u64 offset)
-{
-	struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
-
-	bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
-	bio->bi_iter.bi_sector	= offset;
-	bch2_bio_map(bio, bn, PAGE_SIZE);
-
-	submit_bio_wait(bio);
-	if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
-			       "IO error in try_read_btree_node() at %llu: %s",
-			       offset, bch2_blk_status_to_str(bio->bi_status)))
-		return;
-
-	if (le64_to_cpu(bn->magic) != bset_magic(c))
-		return;
-
-	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
-		struct nonce nonce = btree_nonce(&bn->keys, 0);
-		unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
-
-		bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes);
-	}
-
-	if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
-		return;
-
-	if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
-		return;
-
-	rcu_read_lock();
-	struct found_btree_node n = {
-		.btree_id	= BTREE_NODE_ID(bn),
-		.level		= BTREE_NODE_LEVEL(bn),
-		.seq		= BTREE_NODE_SEQ(bn),
-		.cookie		= le64_to_cpu(bn->keys.seq),
-		.min_key	= bn->min_key,
-		.max_key	= bn->max_key,
-		.nr_ptrs	= 1,
-		.ptrs[0].type	= 1 << BCH_EXTENT_ENTRY_ptr,
-		.ptrs[0].offset	= offset,
-		.ptrs[0].dev	= ca->dev_idx,
-		.ptrs[0].gen	= *bucket_gen(ca, sector_to_bucket(ca, offset)),
-	};
-	rcu_read_unlock();
-
-	if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
-		mutex_lock(&f->lock);
-		if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
-			bch_err(c, "try_read_btree_node() can't handle endian conversion");
-			f->ret = -EINVAL;
-			goto unlock;
-		}
-
-		if (darray_push(&f->nodes, n))
-			f->ret = -ENOMEM;
-unlock:
-		mutex_unlock(&f->lock);
-	}
-}
-
-static int read_btree_nodes_worker(void *p)
-{
-	struct find_btree_nodes_worker *w = p;
-	struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
-	struct bch_dev *ca = w->ca;
-	void *buf = (void *) __get_free_page(GFP_KERNEL);
-	struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
-	unsigned long last_print = jiffies;
-
-	if (!buf || !bio) {
-		bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
-		w->f->ret = -ENOMEM;
-		goto err;
-	}
-
-	for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
-		for (unsigned bucket_offset = 0;
-		     bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
-		     bucket_offset += btree_sectors(c)) {
-			if (time_after(jiffies, last_print + HZ * 30)) {
-				u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
-				u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;
-
-				bch_info(ca, "%s: %2u%% done", __func__,
-					 (unsigned) div64_u64(cur_sector * 100, end_sector));
-				last_print = jiffies;
-			}
-
-			u64 sector = bucket * ca->mi.bucket_size + bucket_offset;
-
-			if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
-			    !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
-				continue;
-
-			try_read_btree_node(w->f, ca, bio, buf, sector);
-		}
-err:
-	bio_put(bio);
-	free_page((unsigned long) buf);
-	percpu_ref_get(&ca->io_ref);
-	closure_put(w->cl);
-	kfree(w);
-	return 0;
-}
-
-static int read_btree_nodes(struct find_btree_nodes *f)
-{
-	struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
-	struct closure cl;
-	int ret = 0;
-
-	closure_init_stack(&cl);
-
-	for_each_online_member(c, ca) {
-		if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
-			continue;
-
-		struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
-		struct task_struct *t;
-
-		if (!w) {
-			percpu_ref_put(&ca->io_ref);
-			ret = -ENOMEM;
-			goto err;
-		}
-
-		percpu_ref_get(&ca->io_ref);
-		closure_get(&cl);
-		w->cl		= &cl;
-		w->f		= f;
-		w->ca		= ca;
-
-		t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
-		ret = IS_ERR_OR_NULL(t);
-		if (ret) {
-			percpu_ref_put(&ca->io_ref);
-			closure_put(&cl);
-			f->ret = ret;
-			bch_err(c, "error starting kthread: %i", ret);
-			break;
-		}
-	}
-err:
-	closure_sync(&cl);
-	return f->ret ?: ret;
-}
-
-static void bubble_up(struct found_btree_node *n, struct found_btree_node *end)
-{
-	while (n + 1 < end &&
-	       found_btree_node_cmp_pos(n, n + 1) > 0) {
-		swap(n[0], n[1]);
-		n++;
-	}
-}
-
-static int handle_overwrites(struct bch_fs *c,
-			     struct found_btree_node *start,
-			     struct found_btree_node *end)
-{
-	struct found_btree_node *n;
-again:
-	for (n = start + 1;
-	     n < end &&
-	     n->btree_id	== start->btree_id &&
-	     n->level		== start->level &&
-	     bpos_lt(n->min_key, start->max_key);
-	     n++)  {
-		int cmp = found_btree_node_cmp_time(start, n);
-
-		if (cmp > 0) {
-			if (bpos_cmp(start->max_key, n->max_key) >= 0)
-				n->overwritten = true;
-			else {
-				n->range_updated = true;
-				n->min_key = bpos_successor(start->max_key);
-				n->range_updated = true;
-				bubble_up(n, end);
-				goto again;
-			}
-		} else if (cmp < 0) {
-			BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0);
-
-			start->max_key = bpos_predecessor(n->min_key);
-			start->range_updated = true;
-		} else if (n->level) {
-			n->overwritten = true;
-		} else {
-			struct printbuf buf = PRINTBUF;
-
-			prt_str(&buf, "overlapping btree nodes with same seq! halting\n  ");
-			found_btree_node_to_text(&buf, c, start);
-			prt_str(&buf, "\n  ");
-			found_btree_node_to_text(&buf, c, n);
-			bch_err(c, "%s", buf.buf);
-			printbuf_exit(&buf);
-			return -BCH_ERR_fsck_repair_unimplemented;
-		}
-	}
-
-	return 0;
-}
-
-int bch2_scan_for_btree_nodes(struct bch_fs *c)
-{
-	struct find_btree_nodes *f = &c->found_btree_nodes;
-	struct printbuf buf = PRINTBUF;
-	size_t dst;
-	int ret = 0;
-
-	if (f->nodes.nr)
-		return 0;
-
-	mutex_init(&f->lock);
-
-	ret = read_btree_nodes(f);
-	if (ret)
-		return ret;
-
-	if (!f->nodes.nr) {
-		bch_err(c, "%s: no btree nodes found", __func__);
-		ret = -EINVAL;
-		goto err;
-	}
-
-	if (0 && c->opts.verbose) {
-		printbuf_reset(&buf);
-		prt_printf(&buf, "%s: nodes found:\n", __func__);
-		found_btree_nodes_to_text(&buf, c, f->nodes);
-		bch2_print_string_as_lines(KERN_INFO, buf.buf);
-	}
-
-	sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
-
-	dst = 0;
-	darray_for_each(f->nodes, i) {
-		struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;
-
-		if (prev &&
-		    prev->cookie == i->cookie) {
-			if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
-				bch_err(c, "%s: found too many replicas for btree node", __func__);
-				ret = -EINVAL;
-				goto err;
-			}
-			prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
-		} else {
-			f->nodes.data[dst++] = *i;
-		}
-	}
-	f->nodes.nr = dst;
-
-	sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
-
-	if (0 && c->opts.verbose) {
-		printbuf_reset(&buf);
-		prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
-		found_btree_nodes_to_text(&buf, c, f->nodes);
-		bch2_print_string_as_lines(KERN_INFO, buf.buf);
-	}
-
-	dst = 0;
-	darray_for_each(f->nodes, i) {
-		if (i->overwritten)
-			continue;
-
-		ret = handle_overwrites(c, i, &darray_top(f->nodes));
-		if (ret)
-			goto err;
-
-		BUG_ON(i->overwritten);
-		f->nodes.data[dst++] = *i;
-	}
-	f->nodes.nr = dst;
-
-	if (c->opts.verbose) {
-		printbuf_reset(&buf);
-		prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
-		found_btree_nodes_to_text(&buf, c, f->nodes);
-		bch2_print_string_as_lines(KERN_INFO, buf.buf);
-	}
-
-	eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
-err:
-	printbuf_exit(&buf);
-	return ret;
-}
-
-static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
-{
-	const struct found_btree_node *l = _l;
-	const struct found_btree_node *r = _r;
-
-	return  cmp_int(l->btree_id,	r->btree_id) ?:
-	       -cmp_int(l->level,	r->level) ?:
-		bpos_cmp(l->max_key,	r->min_key);
-}
-
-#define for_each_found_btree_node_in_range(_f, _search, _idx)				\
-	for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr,		\
-					sizeof((_f)->nodes.data[0]),			\
-					found_btree_node_range_start_cmp, &search);	\
-	     _idx < (_f)->nodes.nr &&							\
-	     (_f)->nodes.data[_idx].btree_id == _search.btree_id &&			\
-	     (_f)->nodes.data[_idx].level == _search.level &&				\
-	     bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key);			\
-	     _idx = eytzinger0_next(_idx, (_f)->nodes.nr))
-
-bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
-{
-	struct find_btree_nodes *f = &c->found_btree_nodes;
-
-	struct found_btree_node search = {
-		.btree_id	= b->c.btree_id,
-		.level		= b->c.level,
-		.min_key	= b->data->min_key,
-		.max_key	= b->key.k.p,
-	};
-
-	for_each_found_btree_node_in_range(f, search, idx)
-		if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data))
-			return true;
-	return false;
-}
-
-bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
-{
-	struct found_btree_node search = {
-		.btree_id	= btree,
-		.level		= 0,
-		.min_key	= POS_MIN,
-		.max_key	= SPOS_MAX,
-	};
-
-	for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx)
-		return true;
-	return false;
-}
-
-int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
-			   unsigned level, struct bpos node_min, struct bpos node_max)
-{
-	if (btree_id_is_alloc(btree))
-		return 0;
-
-	struct find_btree_nodes *f = &c->found_btree_nodes;
-
-	int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
-	if (ret)
-		return ret;
-
-	if (c->opts.verbose) {
-		struct printbuf buf = PRINTBUF;
-
-		prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level);
-		bch2_bpos_to_text(&buf, node_min);
-		prt_str(&buf, " - ");
-		bch2_bpos_to_text(&buf, node_max);
-
-		bch_info(c, "%s(): %s", __func__, buf.buf);
-		printbuf_exit(&buf);
-	}
-
-	struct found_btree_node search = {
-		.btree_id	= btree,
-		.level		= level,
-		.min_key	= node_min,
-		.max_key	= node_max,
-	};
-
-	for_each_found_btree_node_in_range(f, search, idx) {
-		struct found_btree_node n = f->nodes.data[idx];
-
-		n.range_updated |= bpos_lt(n.min_key, node_min);
-		n.min_key = bpos_max(n.min_key, node_min);
-
-		n.range_updated |= bpos_gt(n.max_key, node_max);
-		n.max_key = bpos_min(n.max_key, node_max);
-
-		struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
-
-		found_btree_node_to_key(&tmp.k, &n);
-
-		struct printbuf buf = PRINTBUF;
-		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
-		bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
-		printbuf_exit(&buf);
-
-		BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0, NULL));
-
-		ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
-		if (ret)
-			return ret;
-	}
-
-	return 0;
-}
-
-void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
-{
-	darray_exit(&f->nodes);
-}
diff --git a/fs/bcachefs/btree_node_scan.h b/fs/bcachefs/btree_node_scan.h
deleted file mode 100644
index 08687b209787..000000000000
--- a/fs/bcachefs/btree_node_scan.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_NODE_SCAN_H
-#define _BCACHEFS_BTREE_NODE_SCAN_H
-
-int bch2_scan_for_btree_nodes(struct bch_fs *);
-bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *);
-bool bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id);
-int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos);
-void bch2_find_btree_nodes_exit(struct find_btree_nodes *);
-
-#endif /* _BCACHEFS_BTREE_NODE_SCAN_H */
diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h
deleted file mode 100644
index 5cfaeb5ac831..000000000000
--- a/fs/bcachefs/btree_node_scan_types.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
-#define _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
-
-#include "darray.h"
-
-struct found_btree_node {
-	bool			range_updated:1;
-	bool			overwritten:1;
-	u8			btree_id;
-	u8			level;
-	unsigned		sectors_written;
-	u32			seq;
-	u64			cookie;
-
-	struct bpos		min_key;
-	struct bpos		max_key;
-
-	unsigned		nr_ptrs;
-	struct bch_extent_ptr	ptrs[BCH_REPLICAS_MAX];
-};
-
-typedef DARRAY(struct found_btree_node)	found_btree_nodes;
-
-struct find_btree_nodes {
-	int			ret;
-	struct mutex		lock;
-	found_btree_nodes	nodes;
-};
-
-#endif /* _BCACHEFS_BTREE_NODE_SCAN_TYPES_H */
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
deleted file mode 100644
index 74e1ff225674..000000000000
--- a/fs/bcachefs/btree_trans_commit.c
+++ /dev/null
@@ -1,1155 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "btree_gc.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_journal_iter.h"
-#include "btree_key_cache.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "errcode.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-#include "replicas.h"
-#include "snapshot.h"
-
-#include <linux/prefetch.h>
-
-static const char * const trans_commit_flags_strs[] = {
-#define x(n, ...) #n,
-	BCH_TRANS_COMMIT_FLAGS()
-#undef x
-	NULL
-};
-
-void bch2_trans_commit_flags_to_text(struct printbuf *out, enum bch_trans_commit_flags flags)
-{
-	enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
-
-	prt_printf(out, "watermark=%s", bch2_watermarks[watermark]);
-
-	flags >>= BCH_WATERMARK_BITS;
-	if (flags) {
-		prt_char(out, ' ');
-		bch2_prt_bitflags(out, trans_commit_flags_strs, flags);
-	}
-}
-
-static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
-	struct bch_fs *c = trans->c;
-	struct bkey u;
-	struct bkey_s_c k = bch2_btree_path_peek_slot_exact(trans->paths + i->path, &u);
-
-	if (unlikely(trans->journal_replay_not_finished)) {
-		struct bkey_i *j_k =
-			bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p);
-
-		if (j_k)
-			k = bkey_i_to_s_c(j_k);
-	}
-
-	u = *k.k;
-	u.needs_whiteout = i->old_k.needs_whiteout;
-
-	BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey)));
-	BUG_ON(i->old_v != k.v);
-#endif
-}
-
-static inline struct btree_path_level *insert_l(struct btree_trans *trans, struct btree_insert_entry *i)
-{
-	return (trans->paths + i->path)->l + i->level;
-}
-
-static inline bool same_leaf_as_prev(struct btree_trans *trans,
-				     struct btree_insert_entry *i)
-{
-	return i != trans->updates &&
-		insert_l(trans, &i[0])->b == insert_l(trans, &i[-1])->b;
-}
-
-static inline bool same_leaf_as_next(struct btree_trans *trans,
-				     struct btree_insert_entry *i)
-{
-	return i + 1 < trans->updates + trans->nr_updates &&
-		insert_l(trans, &i[0])->b == insert_l(trans, &i[1])->b;
-}
-
-inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
-					   struct btree_path *path,
-					   struct btree *b)
-{
-	struct bch_fs *c = trans->c;
-
-	if (unlikely(btree_node_just_written(b)) &&
-	    bch2_btree_post_write_cleanup(c, b))
-		bch2_trans_node_reinit_iter(trans, b);
-
-	/*
-	 * If the last bset has been written, or if it's gotten too big - start
-	 * a new bset to insert into:
-	 */
-	if (want_new_bset(c, b))
-		bch2_btree_init_next(trans, b);
-}
-
-static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
-{
-	while (--i >= trans->updates) {
-		if (same_leaf_as_prev(trans, i))
-			continue;
-
-		bch2_btree_node_unlock_write(trans, trans->paths + i->path, insert_l(trans, i)->b);
-	}
-
-	trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
-	return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
-}
-
-static inline int bch2_trans_lock_write(struct btree_trans *trans)
-{
-	EBUG_ON(trans->write_locked);
-
-	trans_for_each_update(trans, i) {
-		if (same_leaf_as_prev(trans, i))
-			continue;
-
-		if (bch2_btree_node_lock_write(trans, trans->paths + i->path, &insert_l(trans, i)->b->c))
-			return trans_lock_write_fail(trans, i);
-
-		if (!i->cached)
-			bch2_btree_node_prep_for_write(trans, trans->paths + i->path, insert_l(trans, i)->b);
-	}
-
-	trans->write_locked = true;
-	return 0;
-}
-
-static inline void bch2_trans_unlock_write(struct btree_trans *trans)
-{
-	if (likely(trans->write_locked)) {
-		trans_for_each_update(trans, i)
-			if (!same_leaf_as_prev(trans, i))
-				bch2_btree_node_unlock_write_inlined(trans,
-						trans->paths + i->path, insert_l(trans, i)->b);
-		trans->write_locked = false;
-	}
-}
-
-/* Inserting into a given leaf node (last stage of insert): */
-
-/* Handle overwrites and do insert, for non extents: */
-bool bch2_btree_bset_insert_key(struct btree_trans *trans,
-				struct btree_path *path,
-				struct btree *b,
-				struct btree_node_iter *node_iter,
-				struct bkey_i *insert)
-{
-	struct bkey_packed *k;
-	unsigned clobber_u64s = 0, new_u64s = 0;
-
-	EBUG_ON(btree_node_just_written(b));
-	EBUG_ON(bset_written(b, btree_bset_last(b)));
-	EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
-	EBUG_ON(bpos_lt(insert->k.p, b->data->min_key));
-	EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
-	EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b));
-	EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos));
-
-	k = bch2_btree_node_iter_peek_all(node_iter, b);
-	if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
-		k = NULL;
-
-	/* @k is the key being overwritten/deleted, if any: */
-	EBUG_ON(k && bkey_deleted(k));
-
-	/* Deleting, but not found? nothing to do: */
-	if (bkey_deleted(&insert->k) && !k)
-		return false;
-
-	if (bkey_deleted(&insert->k)) {
-		/* Deleting: */
-		btree_account_key_drop(b, k);
-		k->type = KEY_TYPE_deleted;
-
-		if (k->needs_whiteout)
-			push_whiteout(b, insert->k.p);
-		k->needs_whiteout = false;
-
-		if (k >= btree_bset_last(b)->start) {
-			clobber_u64s = k->u64s;
-			bch2_bset_delete(b, k, clobber_u64s);
-			goto fix_iter;
-		} else {
-			bch2_btree_path_fix_key_modified(trans, b, k);
-		}
-
-		return true;
-	}
-
-	if (k) {
-		/* Overwriting: */
-		btree_account_key_drop(b, k);
-		k->type = KEY_TYPE_deleted;
-
-		insert->k.needs_whiteout = k->needs_whiteout;
-		k->needs_whiteout = false;
-
-		if (k >= btree_bset_last(b)->start) {
-			clobber_u64s = k->u64s;
-			goto overwrite;
-		} else {
-			bch2_btree_path_fix_key_modified(trans, b, k);
-		}
-	}
-
-	k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
-overwrite:
-	bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
-	new_u64s = k->u64s;
-fix_iter:
-	if (clobber_u64s != new_u64s)
-		bch2_btree_node_iter_fix(trans, path, b, node_iter, k,
-					 clobber_u64s, new_u64s);
-	return true;
-}
-
-static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
-			       unsigned i, u64 seq)
-{
-	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-	struct btree_write *w = container_of(pin, struct btree_write, journal);
-	struct btree *b = container_of(w, struct btree, writes[i]);
-	struct btree_trans *trans = bch2_trans_get(c);
-	unsigned long old, new, v;
-	unsigned idx = w - b->writes;
-
-	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
-	v = READ_ONCE(b->flags);
-
-	do {
-		old = new = v;
-
-		if (!(old & (1 << BTREE_NODE_dirty)) ||
-		    !!(old & (1 << BTREE_NODE_write_idx)) != idx ||
-		    w->journal.seq != seq)
-			break;
-
-		new &= ~BTREE_WRITE_TYPE_MASK;
-		new |= BTREE_WRITE_journal_reclaim;
-		new |= 1 << BTREE_NODE_need_write;
-	} while ((v = cmpxchg(&b->flags, old, new)) != old);
-
-	btree_node_write_if_need(c, b, SIX_LOCK_read);
-	six_unlock_read(&b->c.lock);
-
-	bch2_trans_put(trans);
-	return 0;
-}
-
-int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
-{
-	return __btree_node_flush(j, pin, 0, seq);
-}
-
-int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
-{
-	return __btree_node_flush(j, pin, 1, seq);
-}
-
-inline void bch2_btree_add_journal_pin(struct bch_fs *c,
-				       struct btree *b, u64 seq)
-{
-	struct btree_write *w = btree_current_write(b);
-
-	bch2_journal_pin_add(&c->journal, seq, &w->journal,
-			     btree_node_write_idx(b) == 0
-			     ? bch2_btree_node_flush0
-			     : bch2_btree_node_flush1);
-}
-
-/**
- * bch2_btree_insert_key_leaf() - insert a key one key into a leaf node
- * @trans:		btree transaction object
- * @path:		path pointing to @insert's pos
- * @insert:		key to insert
- * @journal_seq:	sequence number of journal reservation
- */
-inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
-				       struct btree_path *path,
-				       struct bkey_i *insert,
-				       u64 journal_seq)
-{
-	struct bch_fs *c = trans->c;
-	struct btree *b = path_l(path)->b;
-	struct bset_tree *t = bset_tree_last(b);
-	struct bset *i = bset(b, t);
-	int old_u64s = bset_u64s(t);
-	int old_live_u64s = b->nr.live_u64s;
-	int live_u64s_added, u64s_added;
-
-	if (unlikely(!bch2_btree_bset_insert_key(trans, path, b,
-					&path_l(path)->iter, insert)))
-		return;
-
-	i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq)));
-
-	bch2_btree_add_journal_pin(c, b, journal_seq);
-
-	if (unlikely(!btree_node_dirty(b))) {
-		EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
-		set_btree_node_dirty_acct(c, b);
-	}
-
-	live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
-	u64s_added = (int) bset_u64s(t) - old_u64s;
-
-	if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
-		b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
-	if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
-		b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
-
-	if (u64s_added > live_u64s_added &&
-	    bch2_maybe_compact_whiteouts(c, b))
-		bch2_trans_node_reinit_iter(trans, b);
-}
-
-/* Cached btree updates: */
-
-/* Normal update interface: */
-
-static inline void btree_insert_entry_checks(struct btree_trans *trans,
-					     struct btree_insert_entry *i)
-{
-	struct btree_path *path = trans->paths + i->path;
-
-	BUG_ON(!bpos_eq(i->k->k.p, path->pos));
-	BUG_ON(i->cached	!= path->cached);
-	BUG_ON(i->level		!= path->level);
-	BUG_ON(i->btree_id	!= path->btree_id);
-	EBUG_ON(!i->level &&
-		btree_type_has_snapshots(i->btree_id) &&
-		!(i->flags & BTREE_UPDATE_internal_snapshot_node) &&
-		test_bit(JOURNAL_replay_done, &trans->c->journal.flags) &&
-		i->k->k.p.snapshot &&
-		bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0);
-}
-
-static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
-						      unsigned flags)
-{
-	return bch2_journal_res_get(&trans->c->journal, &trans->journal_res,
-				    trans->journal_u64s, flags);
-}
-
-#define JSET_ENTRY_LOG_U64s		4
-
-static noinline void journal_transaction_name(struct btree_trans *trans)
-{
-	struct bch_fs *c = trans->c;
-	struct journal *j = &c->journal;
-	struct jset_entry *entry =
-		bch2_journal_add_entry(j, &trans->journal_res,
-				       BCH_JSET_ENTRY_log, 0, 0,
-				       JSET_ENTRY_LOG_U64s);
-	struct jset_entry_log *l =
-		container_of(entry, struct jset_entry_log, entry);
-
-	strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64));
-}
-
-static inline int btree_key_can_insert(struct btree_trans *trans,
-				       struct btree *b, unsigned u64s)
-{
-	if (!bch2_btree_node_insert_fits(b, u64s))
-		return -BCH_ERR_btree_insert_btree_node_full;
-
-	return 0;
-}
-
-noinline static int
-btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
-				     struct btree_path *path, unsigned new_u64s)
-{
-	struct bkey_cached *ck = (void *) path->l[0].b;
-	struct bkey_i *new_k;
-	int ret;
-
-	bch2_trans_unlock_write(trans);
-	bch2_trans_unlock(trans);
-
-	new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
-	if (!new_k) {
-		bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
-			bch2_btree_id_str(path->btree_id), new_u64s);
-		return -BCH_ERR_ENOMEM_btree_key_cache_insert;
-	}
-
-	ret =   bch2_trans_relock(trans) ?:
-		bch2_trans_lock_write(trans);
-	if (unlikely(ret)) {
-		kfree(new_k);
-		return ret;
-	}
-
-	memcpy(new_k, ck->k, ck->u64s * sizeof(u64));
-
-	trans_for_each_update(trans, i)
-		if (i->old_v == &ck->k->v)
-			i->old_v = &new_k->v;
-
-	kfree(ck->k);
-	ck->u64s	= new_u64s;
-	ck->k		= new_k;
-	return 0;
-}
-
-static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags,
-				       struct btree_path *path, unsigned u64s)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_cached *ck = (void *) path->l[0].b;
-	unsigned new_u64s;
-	struct bkey_i *new_k;
-	unsigned watermark = flags & BCH_WATERMARK_MASK;
-
-	EBUG_ON(path->level);
-
-	if (watermark < BCH_WATERMARK_reclaim &&
-	    !test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
-	    bch2_btree_key_cache_must_wait(c))
-		return -BCH_ERR_btree_insert_need_journal_reclaim;
-
-	/*
-	 * bch2_varint_decode can read past the end of the buffer by at most 7
-	 * bytes (it won't be used):
-	 */
-	u64s += 1;
-
-	if (u64s <= ck->u64s)
-		return 0;
-
-	new_u64s	= roundup_pow_of_two(u64s);
-	new_k		= krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
-	if (unlikely(!new_k))
-		return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
-
-	trans_for_each_update(trans, i)
-		if (i->old_v == &ck->k->v)
-			i->old_v = &new_k->v;
-
-	ck->u64s	= new_u64s;
-	ck->k		= new_k;
-	return 0;
-}
-
-/* Triggers: */
-
-static int run_one_mem_trigger(struct btree_trans *trans,
-			       struct btree_insert_entry *i,
-			       unsigned flags)
-{
-	struct bkey_s_c old = { &i->old_k, i->old_v };
-	struct bkey_i *new = i->k;
-	const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
-	const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
-	int ret;
-
-	verify_update_old_key(trans, i);
-
-	if (unlikely(flags & BTREE_TRIGGER_norun))
-		return 0;
-
-	if (old_ops->trigger == new_ops->trigger) {
-		ret   = bch2_key_trigger(trans, i->btree_id, i->level,
-				old, bkey_i_to_s(new),
-				BTREE_TRIGGER_insert|BTREE_TRIGGER_overwrite|flags);
-	} else {
-		ret   = bch2_key_trigger_new(trans, i->btree_id, i->level,
-				bkey_i_to_s(new), flags) ?:
-			bch2_key_trigger_old(trans, i->btree_id, i->level,
-				old, flags);
-	}
-
-	return ret;
-}
-
-static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
-				 bool overwrite)
-{
-	/*
-	 * Transactional triggers create new btree_insert_entries, so we can't
-	 * pass them a pointer to a btree_insert_entry, that memory is going to
-	 * move:
-	 */
-	struct bkey old_k = i->old_k;
-	struct bkey_s_c old = { &old_k, i->old_v };
-	const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
-	const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
-	unsigned flags = i->flags|BTREE_TRIGGER_transactional;
-
-	verify_update_old_key(trans, i);
-
-	if ((i->flags & BTREE_TRIGGER_norun) ||
-	    !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
-		return 0;
-
-	if (!i->insert_trigger_run &&
-	    !i->overwrite_trigger_run &&
-	    old_ops->trigger == new_ops->trigger) {
-		i->overwrite_trigger_run = true;
-		i->insert_trigger_run = true;
-		return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k),
-					BTREE_TRIGGER_insert|
-					BTREE_TRIGGER_overwrite|flags) ?: 1;
-	} else if (overwrite && !i->overwrite_trigger_run) {
-		i->overwrite_trigger_run = true;
-		return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1;
-	} else if (!overwrite && !i->insert_trigger_run) {
-		i->insert_trigger_run = true;
-		return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1;
-	} else {
-		return 0;
-	}
-}
-
-static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
-			      unsigned btree_id_start)
-{
-	bool trans_trigger_run;
-	int ret, overwrite;
-
-	for (overwrite = 1; overwrite >= 0; --overwrite) {
-
-		/*
-		 * Running triggers will append more updates to the list of updates as
-		 * we're walking it:
-		 */
-		do {
-			trans_trigger_run = false;
-
-			for (unsigned i = btree_id_start;
-			     i < trans->nr_updates && trans->updates[i].btree_id <= btree_id;
-			     i++) {
-				if (trans->updates[i].btree_id != btree_id)
-					continue;
-
-				ret = run_one_trans_trigger(trans, trans->updates + i, overwrite);
-				if (ret < 0)
-					return ret;
-				if (ret)
-					trans_trigger_run = true;
-			}
-		} while (trans_trigger_run);
-	}
-
-	return 0;
-}
-
-static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
-{
-	unsigned btree_id = 0, btree_id_start = 0;
-	int ret = 0;
-
-	/*
-	 *
-	 * For a given btree, this algorithm runs insert triggers before
-	 * overwrite triggers: this is so that when extents are being moved
-	 * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
-	 * they are re-added.
-	 */
-	for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
-		if (btree_id == BTREE_ID_alloc)
-			continue;
-
-		while (btree_id_start < trans->nr_updates &&
-		       trans->updates[btree_id_start].btree_id < btree_id)
-			btree_id_start++;
-
-		ret = run_btree_triggers(trans, btree_id, btree_id_start);
-		if (ret)
-			return ret;
-	}
-
-	for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
-		struct btree_insert_entry *i = trans->updates + idx;
-
-		if (i->btree_id > BTREE_ID_alloc)
-			break;
-		if (i->btree_id == BTREE_ID_alloc) {
-			ret = run_btree_triggers(trans, BTREE_ID_alloc, idx);
-			if (ret)
-				return ret;
-			break;
-		}
-	}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-	trans_for_each_update(trans, i)
-		BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
-		       (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
-		       (!i->insert_trigger_run || !i->overwrite_trigger_run));
-#endif
-	return 0;
-}
-
-static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
-{
-	trans_for_each_update(trans, i) {
-		/*
-		 * XXX: synchronization of cached update triggers with gc
-		 * XXX: synchronization of interior node updates with gc
-		 */
-		BUG_ON(i->cached || i->level);
-
-		if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) &&
-		    gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) {
-			int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_gc);
-			if (ret)
-				return ret;
-		}
-	}
-
-	return 0;
-}
-
-static inline int
-bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
-			       struct btree_insert_entry **stopped_at,
-			       unsigned long trace_ip)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_trans_commit_hook *h;
-	unsigned u64s = 0;
-	int ret;
-
-	bch2_trans_verify_not_unlocked(trans);
-	bch2_trans_verify_not_in_restart(trans);
-
-	if (race_fault()) {
-		trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
-		return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject);
-	}
-
-	/*
-	 * Check if the insert will fit in the leaf node with the write lock
-	 * held, otherwise another thread could write the node changing the
-	 * amount of space available:
-	 */
-
-	prefetch(&trans->c->journal.flags);
-
-	trans_for_each_update(trans, i) {
-		/* Multiple inserts might go to same leaf: */
-		if (!same_leaf_as_prev(trans, i))
-			u64s = 0;
-
-		u64s += i->k->k.u64s;
-		ret = !i->cached
-			? btree_key_can_insert(trans, insert_l(trans, i)->b, u64s)
-			: btree_key_can_insert_cached(trans, flags, trans->paths + i->path, u64s);
-		if (ret) {
-			*stopped_at = i;
-			return ret;
-		}
-
-		i->k->k.needs_whiteout = false;
-	}
-
-	/*
-	 * Don't get journal reservation until after we know insert will
-	 * succeed:
-	 */
-	if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
-		ret = bch2_trans_journal_res_get(trans,
-				(flags & BCH_WATERMARK_MASK)|
-				JOURNAL_RES_GET_NONBLOCK);
-		if (ret)
-			return ret;
-
-		if (unlikely(trans->journal_transaction_names))
-			journal_transaction_name(trans);
-	}
-
-	/*
-	 * Not allowed to fail after we've gotten our journal reservation - we
-	 * have to use it:
-	 */
-
-	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
-	    !(flags & BCH_TRANS_COMMIT_no_journal_res)) {
-		if (bch2_journal_seq_verify)
-			trans_for_each_update(trans, i)
-				i->k->k.version.lo = trans->journal_res.seq;
-		else if (bch2_inject_invalid_keys)
-			trans_for_each_update(trans, i)
-				i->k->k.version = MAX_VERSION;
-	}
-
-	if (trans->fs_usage_deltas &&
-	    bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
-		return -BCH_ERR_btree_insert_need_mark_replicas;
-
-	/* XXX: we only want to run this if deltas are nonzero */
-	bch2_trans_account_disk_usage_change(trans);
-
-	h = trans->hooks;
-	while (h) {
-		ret = h->fn(trans, h);
-		if (ret)
-			goto revert_fs_usage;
-		h = h->next;
-	}
-
-	trans_for_each_update(trans, i)
-		if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) {
-			ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_atomic|i->flags);
-			if (ret)
-				goto fatal_err;
-		}
-
-	if (unlikely(c->gc_pos.phase)) {
-		ret = bch2_trans_commit_run_gc_triggers(trans);
-		if  (ret)
-			goto fatal_err;
-	}
-
-	if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
-		struct journal *j = &c->journal;
-		struct jset_entry *entry;
-
-		trans_for_each_update(trans, i) {
-			if (i->key_cache_already_flushed)
-				continue;
-
-			if (i->flags & BTREE_UPDATE_nojournal)
-				continue;
-
-			verify_update_old_key(trans, i);
-
-			if (trans->journal_transaction_names) {
-				entry = bch2_journal_add_entry(j, &trans->journal_res,
-						       BCH_JSET_ENTRY_overwrite,
-						       i->btree_id, i->level,
-						       i->old_k.u64s);
-				bkey_reassemble((struct bkey_i *) entry->start,
-						(struct bkey_s_c) { &i->old_k, i->old_v });
-			}
-
-			entry = bch2_journal_add_entry(j, &trans->journal_res,
-					       BCH_JSET_ENTRY_btree_keys,
-					       i->btree_id, i->level,
-					       i->k->k.u64s);
-			bkey_copy((struct bkey_i *) entry->start, i->k);
-		}
-
-		memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
-				  trans->journal_entries,
-				  trans->journal_entries_u64s);
-
-		trans->journal_res.offset	+= trans->journal_entries_u64s;
-		trans->journal_res.u64s		-= trans->journal_entries_u64s;
-
-		if (trans->journal_seq)
-			*trans->journal_seq = trans->journal_res.seq;
-	}
-
-	trans_for_each_update(trans, i) {
-		struct btree_path *path = trans->paths + i->path;
-
-		if (!i->cached) {
-			bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq);
-		} else if (!i->key_cache_already_flushed)
-			bch2_btree_insert_key_cached(trans, flags, i);
-		else {
-			bch2_btree_key_cache_drop(trans, path);
-			btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
-		}
-	}
-
-	return 0;
-fatal_err:
-	bch2_fatal_error(c);
-revert_fs_usage:
-	if (trans->fs_usage_deltas)
-		bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas);
-	return ret;
-}
-
-static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
-{
-	trans_for_each_update(trans, i)
-		bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
-}
-
-static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
-						   enum bch_validate_flags flags,
-						   struct btree_insert_entry *i,
-						   struct printbuf *err)
-{
-	struct bch_fs *c = trans->c;
-
-	printbuf_reset(err);
-	prt_printf(err, "invalid bkey on insert from %s -> %ps\n",
-		   trans->fn, (void *) i->ip_allocated);
-	printbuf_indent_add(err, 2);
-
-	bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k));
-	prt_newline(err);
-
-	bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, flags, err);
-	bch2_print_string_as_lines(KERN_ERR, err->buf);
-
-	bch2_inconsistent_error(c);
-	bch2_dump_trans_updates(trans);
-
-	return -EINVAL;
-}
-
-static noinline int bch2_trans_commit_journal_entry_invalid(struct btree_trans *trans,
-						   struct jset_entry *i)
-{
-	struct bch_fs *c = trans->c;
-	struct printbuf buf = PRINTBUF;
-
-	prt_printf(&buf, "invalid bkey on insert from %s\n", trans->fn);
-	printbuf_indent_add(&buf, 2);
-
-	bch2_journal_entry_to_text(&buf, c, i);
-	prt_newline(&buf);
-
-	bch2_print_string_as_lines(KERN_ERR, buf.buf);
-
-	bch2_inconsistent_error(c);
-	bch2_dump_trans_updates(trans);
-
-	return -EINVAL;
-}
-
-static int bch2_trans_commit_journal_pin_flush(struct journal *j,
-				struct journal_entry_pin *_pin, u64 seq)
-{
-	return 0;
-}
-
-/*
- * Get journal reservation, take write locks, and attempt to do btree update(s):
- */
-static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags,
-				       struct btree_insert_entry **stopped_at,
-				       unsigned long trace_ip)
-{
-	struct bch_fs *c = trans->c;
-	int ret = 0, u64s_delta = 0;
-
-	for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
-		struct btree_insert_entry *i = trans->updates + idx;
-		if (i->cached)
-			continue;
-
-		u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
-		u64s_delta -= i->old_btree_u64s;
-
-		if (!same_leaf_as_next(trans, i)) {
-			if (u64s_delta <= 0) {
-				ret = bch2_foreground_maybe_merge(trans, i->path,
-							i->level, flags);
-				if (unlikely(ret))
-					return ret;
-			}
-
-			u64s_delta = 0;
-		}
-	}
-
-	ret = bch2_trans_lock_write(trans);
-	if (unlikely(ret))
-		return ret;
-
-	ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip);
-
-	if (!ret && unlikely(trans->journal_replay_not_finished))
-		bch2_drop_overwrites_from_journal(trans);
-
-	bch2_trans_unlock_write(trans);
-
-	if (!ret && trans->journal_pin)
-		bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
-				     trans->journal_pin,
-				     bch2_trans_commit_journal_pin_flush);
-
-	/*
-	 * Drop journal reservation after dropping write locks, since dropping
-	 * the journal reservation may kick off a journal write:
-	 */
-	if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
-		bch2_journal_res_put(&c->journal, &trans->journal_res);
-
-	return ret;
-}
-
-static int journal_reclaim_wait_done(struct bch_fs *c)
-{
-	int ret = bch2_journal_error(&c->journal) ?:
-		!bch2_btree_key_cache_must_wait(c);
-
-	if (!ret)
-		journal_reclaim_kick(&c->journal);
-	return ret;
-}
-
-static noinline
-int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
-			    struct btree_insert_entry *i,
-			    int ret, unsigned long trace_ip)
-{
-	struct bch_fs *c = trans->c;
-	enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
-
-	switch (ret) {
-	case -BCH_ERR_btree_insert_btree_node_full:
-		ret = bch2_btree_split_leaf(trans, i->path, flags);
-		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			trace_and_count(c, trans_restart_btree_node_split, trans,
-					trace_ip, trans->paths + i->path);
-		break;
-	case -BCH_ERR_btree_insert_need_mark_replicas:
-		ret = drop_locks_do(trans,
-			bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas));
-		break;
-	case -BCH_ERR_journal_res_get_blocked:
-		/*
-		 * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
-		 * flag
-		 */
-		if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
-		    watermark < BCH_WATERMARK_reclaim) {
-			ret = -BCH_ERR_journal_reclaim_would_deadlock;
-			break;
-		}
-
-		ret = drop_locks_do(trans,
-			bch2_trans_journal_res_get(trans,
-					(flags & BCH_WATERMARK_MASK)|
-					JOURNAL_RES_GET_CHECK));
-		break;
-	case -BCH_ERR_btree_insert_need_journal_reclaim:
-		bch2_trans_unlock(trans);
-
-		trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip);
-
-		wait_event_freezable(c->journal.reclaim_wait,
-				     (ret = journal_reclaim_wait_done(c)));
-		if (ret < 0)
-			break;
-
-		ret = bch2_trans_relock(trans);
-		break;
-	default:
-		BUG_ON(ret >= 0);
-		break;
-	}
-
-	BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
-
-	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
-				(flags & BCH_TRANS_COMMIT_no_enospc), c,
-		"%s: incorrectly got %s\n", __func__, bch2_err_str(ret));
-
-	return ret;
-}
-
-static noinline int
-bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags)
-{
-	struct bch_fs *c = trans->c;
-	int ret;
-
-	if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) ||
-	    test_bit(BCH_FS_started, &c->flags))
-		return -BCH_ERR_erofs_trans_commit;
-
-	ret = drop_locks_do(trans, bch2_fs_read_write_early(c));
-	if (ret)
-		return ret;
-
-	bch2_write_ref_get(c, BCH_WRITE_REF_trans);
-	return 0;
-}
-
-/*
- * This is for updates done in the early part of fsck - btree_gc - before we've
- * gone RW. we only add the new key to the list of keys for journal replay to
- * do.
- */
-static noinline int
-do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
-{
-	struct bch_fs *c = trans->c;
-	int ret = 0;
-
-	trans_for_each_update(trans, i) {
-		ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
-		if (ret)
-			break;
-	}
-
-	return ret;
-}
-
-int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
-{
-	struct btree_insert_entry *errored_at = NULL;
-	struct bch_fs *c = trans->c;
-	int ret = 0;
-
-	bch2_trans_verify_not_unlocked(trans);
-	bch2_trans_verify_not_in_restart(trans);
-
-	if (!trans->nr_updates &&
-	    !trans->journal_entries_u64s)
-		goto out_reset;
-
-	memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
-
-	ret = bch2_trans_commit_run_triggers(trans);
-	if (ret)
-		goto out_reset;
-
-	trans_for_each_update(trans, i) {
-		struct printbuf buf = PRINTBUF;
-		enum bch_validate_flags invalid_flags = 0;
-
-		if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
-			invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
-
-		if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
-					       i->bkey_type, invalid_flags, &buf)))
-			ret = bch2_trans_commit_bkey_invalid(trans, invalid_flags, i, &buf);
-		btree_insert_entry_checks(trans, i);
-		printbuf_exit(&buf);
-
-		if (ret)
-			return ret;
-	}
-
-	for (struct jset_entry *i = trans->journal_entries;
-	     i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
-	     i = vstruct_next(i)) {
-		enum bch_validate_flags invalid_flags = 0;
-
-		if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
-			invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
-
-		if (unlikely(bch2_journal_entry_validate(c, NULL, i,
-					bcachefs_metadata_version_current,
-					CPU_BIG_ENDIAN, invalid_flags)))
-			ret = bch2_trans_commit_journal_entry_invalid(trans, i);
-
-		if (ret)
-			return ret;
-	}
-
-	if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) {
-		ret = do_bch2_trans_commit_to_journal_replay(trans);
-		goto out_reset;
-	}
-
-	if (!(flags & BCH_TRANS_COMMIT_no_check_rw) &&
-	    unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
-		ret = bch2_trans_commit_get_rw_cold(trans, flags);
-		if (ret)
-			goto out_reset;
-	}
-
-	EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
-
-	trans->journal_u64s		= trans->journal_entries_u64s;
-	trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
-	if (trans->journal_transaction_names)
-		trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
-
-	trans_for_each_update(trans, i) {
-		struct btree_path *path = trans->paths + i->path;
-
-		EBUG_ON(!path->should_be_locked);
-
-		ret = bch2_btree_path_upgrade(trans, path, i->level + 1);
-		if (unlikely(ret))
-			goto out;
-
-		EBUG_ON(!btree_node_intent_locked(path, i->level));
-
-		if (i->key_cache_already_flushed)
-			continue;
-
-		if (i->flags & BTREE_UPDATE_nojournal)
-			continue;
-
-		/* we're going to journal the key being updated: */
-		trans->journal_u64s += jset_u64s(i->k->k.u64s);
-
-		/* and we're also going to log the overwrite: */
-		if (trans->journal_transaction_names)
-			trans->journal_u64s += jset_u64s(i->old_k.u64s);
-	}
-
-	if (trans->extra_disk_res) {
-		ret = bch2_disk_reservation_add(c, trans->disk_res,
-				trans->extra_disk_res,
-				(flags & BCH_TRANS_COMMIT_no_enospc)
-				? BCH_DISK_RESERVATION_NOFAIL : 0);
-		if (ret)
-			goto err;
-	}
-retry:
-	errored_at = NULL;
-	bch2_trans_verify_not_unlocked(trans);
-	bch2_trans_verify_not_in_restart(trans);
-	if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
-		memset(&trans->journal_res, 0, sizeof(trans->journal_res));
-
-	ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_);
-
-	/* make sure we didn't drop or screw up locks: */
-	bch2_trans_verify_locks(trans);
-
-	if (ret)
-		goto err;
-
-	trace_and_count(c, transaction_commit, trans, _RET_IP_);
-out:
-	if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw)))
-		bch2_write_ref_put(c, BCH_WRITE_REF_trans);
-out_reset:
-	if (!ret)
-		bch2_trans_downgrade(trans);
-	bch2_trans_reset_updates(trans);
-
-	return ret;
-err:
-	ret = bch2_trans_commit_error(trans, flags, errored_at, ret, _RET_IP_);
-	if (ret)
-		goto out;
-
-	/*
-	 * We might have done another transaction commit in the error path -
-	 * i.e. btree write buffer flush - which will have made use of
-	 * trans->journal_res, but with BCH_TRANS_COMMIT_no_journal_res that is
-	 * how the journal sequence number to pin is passed in - so we must
-	 * restart:
-	 */
-	if (flags & BCH_TRANS_COMMIT_no_journal_res) {
-		ret = -BCH_ERR_transaction_restart_nested;
-		goto out;
-	}
-
-	goto retry;
-}
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
deleted file mode 100644
index d63db4fefe73..000000000000
--- a/fs/bcachefs/btree_types.h
+++ /dev/null
@@ -1,837 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_TYPES_H
-#define _BCACHEFS_BTREE_TYPES_H
-
-#include <linux/list.h>
-#include <linux/rhashtable.h>
-
-#include "bbpos_types.h"
-#include "btree_key_cache_types.h"
-#include "buckets_types.h"
-#include "darray.h"
-#include "errcode.h"
-#include "journal_types.h"
-#include "replicas_types.h"
-#include "six.h"
-
-struct open_bucket;
-struct btree_update;
-struct btree_trans;
-
-#define MAX_BSETS		3U
-
-struct btree_nr_keys {
-
-	/*
-	 * Amount of live metadata (i.e. size of node after a compaction) in
-	 * units of u64s
-	 */
-	u16			live_u64s;
-	u16			bset_u64s[MAX_BSETS];
-
-	/* live keys only: */
-	u16			packed_keys;
-	u16			unpacked_keys;
-};
-
-struct bset_tree {
-	/*
-	 * We construct a binary tree in an array as if the array
-	 * started at 1, so that things line up on the same cachelines
-	 * better: see comments in bset.c at cacheline_to_bkey() for
-	 * details
-	 */
-
-	/* size of the binary tree and prev array */
-	u16			size;
-
-	/* function of size - precalculated for to_inorder() */
-	u16			extra;
-
-	u16			data_offset;
-	u16			aux_data_offset;
-	u16			end_offset;
-};
-
-struct btree_write {
-	struct journal_entry_pin	journal;
-};
-
-struct btree_alloc {
-	struct open_buckets	ob;
-	__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX);
-};
-
-struct btree_bkey_cached_common {
-	struct six_lock		lock;
-	u8			level;
-	u8			btree_id;
-	bool			cached;
-};
-
-struct btree {
-	struct btree_bkey_cached_common c;
-
-	struct rhash_head	hash;
-	u64			hash_val;
-
-	unsigned long		flags;
-	u16			written;
-	u8			nsets;
-	u8			nr_key_bits;
-	u16			version_ondisk;
-
-	struct bkey_format	format;
-
-	struct btree_node	*data;
-	void			*aux_data;
-
-	/*
-	 * Sets of sorted keys - the real btree node - plus a binary search tree
-	 *
-	 * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
-	 * to the memory we have allocated for this btree node. Additionally,
-	 * set[0]->data points to the entire btree node as it exists on disk.
-	 */
-	struct bset_tree	set[MAX_BSETS];
-
-	struct btree_nr_keys	nr;
-	u16			sib_u64s[2];
-	u16			whiteout_u64s;
-	u8			byte_order;
-	u8			unpack_fn_len;
-
-	struct btree_write	writes[2];
-
-	/* Key/pointer for this btree node */
-	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
-
-	/*
-	 * XXX: add a delete sequence number, so when bch2_btree_node_relock()
-	 * fails because the lock sequence number has changed - i.e. the
-	 * contents were modified - we can still relock the node if it's still
-	 * the one we want, without redoing the traversal
-	 */
-
-	/*
-	 * For asynchronous splits/interior node updates:
-	 * When we do a split, we allocate new child nodes and update the parent
-	 * node to point to them: we update the parent in memory immediately,
-	 * but then we must wait until the children have been written out before
-	 * the update to the parent can be written - this is a list of the
-	 * btree_updates that are blocking this node from being
-	 * written:
-	 */
-	struct list_head	write_blocked;
-
-	/*
-	 * Also for asynchronous splits/interior node updates:
-	 * If a btree node isn't reachable yet, we don't want to kick off
-	 * another write - because that write also won't yet be reachable and
-	 * marking it as completed before it's reachable would be incorrect:
-	 */
-	unsigned long		will_make_reachable;
-
-	struct open_buckets	ob;
-
-	/* lru list */
-	struct list_head	list;
-};
-
-struct btree_cache {
-	struct rhashtable	table;
-	bool			table_init_done;
-	/*
-	 * We never free a struct btree, except on shutdown - we just put it on
-	 * the btree_cache_freed list and reuse it later. This simplifies the
-	 * code, and it doesn't cost us much memory as the memory usage is
-	 * dominated by buffers that hold the actual btree node data and those
-	 * can be freed - and the number of struct btrees allocated is
-	 * effectively bounded.
-	 *
-	 * btree_cache_freeable effectively is a small cache - we use it because
-	 * high order page allocations can be rather expensive, and it's quite
-	 * common to delete and allocate btree nodes in quick succession. It
-	 * should never grow past ~2-3 nodes in practice.
-	 */
-	struct mutex		lock;
-	struct list_head	live;
-	struct list_head	freeable;
-	struct list_head	freed_pcpu;
-	struct list_head	freed_nonpcpu;
-
-	/* Number of elements in live + freeable lists */
-	unsigned		used;
-	unsigned		reserve;
-	unsigned		freed;
-	unsigned		not_freed_lock_intent;
-	unsigned		not_freed_lock_write;
-	unsigned		not_freed_dirty;
-	unsigned		not_freed_read_in_flight;
-	unsigned		not_freed_write_in_flight;
-	unsigned		not_freed_noevict;
-	unsigned		not_freed_write_blocked;
-	unsigned		not_freed_will_make_reachable;
-	unsigned		not_freed_access_bit;
-	atomic_t		dirty;
-	struct shrinker		*shrink;
-
-	unsigned		used_by_btree[BTREE_ID_NR];
-
-	/*
-	 * If we need to allocate memory for a new btree node and that
-	 * allocation fails, we can cannibalize another node in the btree cache
-	 * to satisfy the allocation - lock to guarantee only one thread does
-	 * this at a time:
-	 */
-	struct task_struct	*alloc_lock;
-	struct closure_waitlist	alloc_wait;
-
-	struct bbpos		pinned_nodes_start;
-	struct bbpos		pinned_nodes_end;
-	u64			pinned_nodes_leaf_mask;
-	u64			pinned_nodes_interior_mask;
-};
-
-struct btree_node_iter {
-	struct btree_node_iter_set {
-		u16	k, end;
-	} data[MAX_BSETS];
-};
-
-#define BTREE_ITER_FLAGS()			\
-	x(slots)				\
-	x(intent)				\
-	x(prefetch)				\
-	x(is_extents)				\
-	x(not_extents)				\
-	x(cached)				\
-	x(with_key_cache)			\
-	x(with_updates)				\
-	x(with_journal)				\
-	x(snapshot_field)			\
-	x(all_snapshots)			\
-	x(filter_snapshots)			\
-	x(nopreserve)				\
-	x(cached_nofill)			\
-	x(key_cache_fill)			\
-
-#define STR_HASH_FLAGS()			\
-	x(must_create)				\
-	x(must_replace)
-
-#define BTREE_UPDATE_FLAGS()			\
-	x(internal_snapshot_node)		\
-	x(nojournal)				\
-	x(key_cache_reclaim)
-
-
-/*
- * BTREE_TRIGGER_norun - don't run triggers at all
- *
- * BTREE_TRIGGER_transactional - we're running transactional triggers as part of
- * a transaction commit: triggers may generate new updates
- *
- * BTREE_TRIGGER_atomic - we're running atomic triggers during a transaction
- * commit: we have our journal reservation, we're holding btree node write
- * locks, and we know the transaction is going to commit (returning an error
- * here is a fatal error, causing us to go emergency read-only)
- *
- * BTREE_TRIGGER_gc - we're in gc/fsck: running triggers to recalculate e.g. disk usage
- *
- * BTREE_TRIGGER_insert - @new is entering the btree
- * BTREE_TRIGGER_overwrite - @old is leaving the btree
- *
- * BTREE_TRIGGER_bucket_invalidate - signal from bucket invalidate path to alloc
- * trigger
- */
-#define BTREE_TRIGGER_FLAGS()			\
-	x(norun)				\
-	x(transactional)			\
-	x(atomic)				\
-	x(check_repair)				\
-	x(gc)					\
-	x(insert)				\
-	x(overwrite)				\
-	x(is_root)				\
-	x(bucket_invalidate)
-
-enum {
-#define x(n) BTREE_ITER_FLAG_BIT_##n,
-	BTREE_ITER_FLAGS()
-	STR_HASH_FLAGS()
-	BTREE_UPDATE_FLAGS()
-	BTREE_TRIGGER_FLAGS()
-#undef x
-};
-
-/* iter flags must fit in a u16: */
-//BUILD_BUG_ON(BTREE_ITER_FLAG_BIT_key_cache_fill > 15);
-
-enum btree_iter_update_trigger_flags {
-#define x(n) BTREE_ITER_##n	= 1U << BTREE_ITER_FLAG_BIT_##n,
-	BTREE_ITER_FLAGS()
-#undef x
-#define x(n) STR_HASH_##n	= 1U << BTREE_ITER_FLAG_BIT_##n,
-	STR_HASH_FLAGS()
-#undef x
-#define x(n) BTREE_UPDATE_##n	= 1U << BTREE_ITER_FLAG_BIT_##n,
-	BTREE_UPDATE_FLAGS()
-#undef x
-#define x(n) BTREE_TRIGGER_##n	= 1U << BTREE_ITER_FLAG_BIT_##n,
-	BTREE_TRIGGER_FLAGS()
-#undef x
-};
-
-enum btree_path_uptodate {
-	BTREE_ITER_UPTODATE		= 0,
-	BTREE_ITER_NEED_RELOCK		= 1,
-	BTREE_ITER_NEED_TRAVERSE	= 2,
-};
-
-#if defined(CONFIG_BCACHEFS_LOCK_TIME_STATS) || defined(CONFIG_BCACHEFS_DEBUG)
-#define TRACK_PATH_ALLOCATED
-#endif
-
-typedef u16 btree_path_idx_t;
-
-struct btree_path {
-	btree_path_idx_t	sorted_idx;
-	u8			ref;
-	u8			intent_ref;
-
-	/* btree_iter_copy starts here: */
-	struct bpos		pos;
-
-	enum btree_id		btree_id:5;
-	bool			cached:1;
-	bool			preserve:1;
-	enum btree_path_uptodate uptodate:2;
-	/*
-	 * When true, failing to relock this path will cause the transaction to
-	 * restart:
-	 */
-	bool			should_be_locked:1;
-	unsigned		level:3,
-				locks_want:3;
-	u8			nodes_locked;
-
-	struct btree_path_level {
-		struct btree	*b;
-		struct btree_node_iter iter;
-		u32		lock_seq;
-#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
-		u64             lock_taken_time;
-#endif
-	}			l[BTREE_MAX_DEPTH];
-#ifdef TRACK_PATH_ALLOCATED
-	unsigned long		ip_allocated;
-#endif
-};
-
-static inline struct btree_path_level *path_l(struct btree_path *path)
-{
-	return path->l + path->level;
-}
-
-static inline unsigned long btree_path_ip_allocated(struct btree_path *path)
-{
-#ifdef TRACK_PATH_ALLOCATED
-	return path->ip_allocated;
-#else
-	return _THIS_IP_;
-#endif
-}
-
-/*
- * @pos			- iterator's current position
- * @level		- current btree depth
- * @locks_want		- btree level below which we start taking intent locks
- * @nodes_locked	- bitmask indicating which nodes in @nodes are locked
- * @nodes_intent_locked	- bitmask indicating which locks are intent locks
- */
-struct btree_iter {
-	struct btree_trans	*trans;
-	btree_path_idx_t	path;
-	btree_path_idx_t	update_path;
-	btree_path_idx_t	key_cache_path;
-
-	enum btree_id		btree_id:8;
-	u8			min_depth;
-
-	/* btree_iter_copy starts here: */
-	u16			flags;
-
-	/* When we're filtering by snapshot, the snapshot ID we're looking for: */
-	unsigned		snapshot;
-
-	struct bpos		pos;
-	/*
-	 * Current unpacked key - so that bch2_btree_iter_next()/
-	 * bch2_btree_iter_next_slot() can correctly advance pos.
-	 */
-	struct bkey		k;
-
-	/* BTREE_ITER_with_journal: */
-	size_t			journal_idx;
-#ifdef TRACK_PATH_ALLOCATED
-	unsigned long		ip_allocated;
-#endif
-};
-
-#define BKEY_CACHED_ACCESSED		0
-#define BKEY_CACHED_DIRTY		1
-
-struct bkey_cached {
-	struct btree_bkey_cached_common c;
-
-	unsigned long		flags;
-	unsigned long		btree_trans_barrier_seq;
-	u16			u64s;
-	bool			valid;
-	struct bkey_cached_key	key;
-
-	struct rhash_head	hash;
-	struct list_head	list;
-
-	struct journal_entry_pin journal;
-	u64			seq;
-
-	struct bkey_i		*k;
-};
-
-static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
-{
-	return !b->cached
-		? container_of(b, struct btree, c)->key.k.p
-		: container_of(b, struct bkey_cached, c)->key.pos;
-}
-
-struct btree_insert_entry {
-	unsigned		flags;
-	u8			bkey_type;
-	enum btree_id		btree_id:8;
-	u8			level:4;
-	bool			cached:1;
-	bool			insert_trigger_run:1;
-	bool			overwrite_trigger_run:1;
-	bool			key_cache_already_flushed:1;
-	/*
-	 * @old_k may be a key from the journal; @old_btree_u64s always refers
-	 * to the size of the key being overwritten in the btree:
-	 */
-	u8			old_btree_u64s;
-	btree_path_idx_t	path;
-	struct bkey_i		*k;
-	/* key being overwritten: */
-	struct bkey		old_k;
-	const struct bch_val	*old_v;
-	unsigned long		ip_allocated;
-};
-
-/* Number of btree paths we preallocate, usually enough */
-#define BTREE_ITER_INITIAL		64
-/*
- * Lmiit for btree_trans_too_many_iters(); this is enough that almost all code
- * paths should run inside this limit, and if they don't it usually indicates a
- * bug (leaking/duplicated btree paths).
- *
- * exception: some fsck paths
- *
- * bugs with excessive path usage seem to have possibly been eliminated now, so
- * we might consider eliminating this (and btree_trans_too_many_iter()) at some
- * point.
- */
-#define BTREE_ITER_NORMAL_LIMIT		256
-/* never exceed limit */
-#define BTREE_ITER_MAX			(1U << 10)
-
-struct btree_trans_commit_hook;
-typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
-
-struct btree_trans_commit_hook {
-	btree_trans_commit_hook_fn	*fn;
-	struct btree_trans_commit_hook	*next;
-};
-
-#define BTREE_TRANS_MEM_MAX	(1U << 16)
-
-#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS	10000
-
-struct btree_trans_paths {
-	unsigned long		nr_paths;
-	struct btree_path	paths[];
-};
-
-struct btree_trans {
-	struct bch_fs		*c;
-
-	unsigned long		*paths_allocated;
-	struct btree_path	*paths;
-	btree_path_idx_t	*sorted;
-	struct btree_insert_entry *updates;
-
-	void			*mem;
-	unsigned		mem_top;
-	unsigned		mem_bytes;
-
-	btree_path_idx_t	nr_sorted;
-	btree_path_idx_t	nr_paths;
-	btree_path_idx_t	nr_paths_max;
-	u8			fn_idx;
-	u8			nr_updates;
-	u8			lock_must_abort;
-	bool			lock_may_not_fail:1;
-	bool			srcu_held:1;
-	bool			locked:1;
-	bool			write_locked:1;
-	bool			used_mempool:1;
-	bool			in_traverse_all:1;
-	bool			paths_sorted:1;
-	bool			memory_allocation_failure:1;
-	bool			journal_transaction_names:1;
-	bool			journal_replay_not_finished:1;
-	bool			notrace_relock_fail:1;
-	enum bch_errcode	restarted:16;
-	u32			restart_count;
-
-	u64			last_begin_time;
-	unsigned long		last_begin_ip;
-	unsigned long		last_restarted_ip;
-	unsigned long		last_unlock_ip;
-	unsigned long		srcu_lock_time;
-
-	const char		*fn;
-	struct btree_bkey_cached_common *locking;
-	struct six_lock_waiter	locking_wait;
-	int			srcu_idx;
-
-	/* update path: */
-	u16			journal_entries_u64s;
-	u16			journal_entries_size;
-	struct jset_entry	*journal_entries;
-
-	struct btree_trans_commit_hook *hooks;
-	struct journal_entry_pin *journal_pin;
-
-	struct journal_res	journal_res;
-	u64			*journal_seq;
-	struct disk_reservation *disk_res;
-
-	struct bch_fs_usage_base fs_usage_delta;
-
-	unsigned		journal_u64s;
-	unsigned		extra_disk_res; /* XXX kill */
-	struct replicas_delta_list *fs_usage_deltas;
-
-	/* Entries before this are zeroed out on every bch2_trans_get() call */
-
-	struct list_head	list;
-	struct closure		ref;
-
-	unsigned long		_paths_allocated[BITS_TO_LONGS(BTREE_ITER_INITIAL)];
-	struct btree_trans_paths trans_paths;
-	struct btree_path	_paths[BTREE_ITER_INITIAL];
-	btree_path_idx_t	_sorted[BTREE_ITER_INITIAL + 4];
-	struct btree_insert_entry _updates[BTREE_ITER_INITIAL];
-};
-
-static inline struct btree_path *btree_iter_path(struct btree_trans *trans, struct btree_iter *iter)
-{
-	return trans->paths + iter->path;
-}
-
-static inline struct btree_path *btree_iter_key_cache_path(struct btree_trans *trans, struct btree_iter *iter)
-{
-	return iter->key_cache_path
-		? trans->paths + iter->key_cache_path
-		: NULL;
-}
-
-#define BCH_BTREE_WRITE_TYPES()						\
-	x(initial,		0)					\
-	x(init_next_bset,	1)					\
-	x(cache_reclaim,	2)					\
-	x(journal_reclaim,	3)					\
-	x(interior,		4)
-
-enum btree_write_type {
-#define x(t, n) BTREE_WRITE_##t,
-	BCH_BTREE_WRITE_TYPES()
-#undef x
-	BTREE_WRITE_TYPE_NR,
-};
-
-#define BTREE_WRITE_TYPE_MASK	(roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1)
-#define BTREE_WRITE_TYPE_BITS	ilog2(roundup_pow_of_two(BTREE_WRITE_TYPE_NR))
-
-#define BTREE_FLAGS()							\
-	x(read_in_flight)						\
-	x(read_error)							\
-	x(dirty)							\
-	x(need_write)							\
-	x(write_blocked)						\
-	x(will_make_reachable)						\
-	x(noevict)							\
-	x(write_idx)							\
-	x(accessed)							\
-	x(write_in_flight)						\
-	x(write_in_flight_inner)					\
-	x(just_written)							\
-	x(dying)							\
-	x(fake)								\
-	x(need_rewrite)							\
-	x(never_write)
-
-enum btree_flags {
-	/* First bits for btree node write type */
-	BTREE_NODE_FLAGS_START = BTREE_WRITE_TYPE_BITS - 1,
-#define x(flag)	BTREE_NODE_##flag,
-	BTREE_FLAGS()
-#undef x
-};
-
-#define x(flag)								\
-static inline bool btree_node_ ## flag(struct btree *b)			\
-{	return test_bit(BTREE_NODE_ ## flag, &b->flags); }		\
-									\
-static inline void set_btree_node_ ## flag(struct btree *b)		\
-{	set_bit(BTREE_NODE_ ## flag, &b->flags); }			\
-									\
-static inline void clear_btree_node_ ## flag(struct btree *b)		\
-{	clear_bit(BTREE_NODE_ ## flag, &b->flags); }
-
-BTREE_FLAGS()
-#undef x
-
-static inline struct btree_write *btree_current_write(struct btree *b)
-{
-	return b->writes + btree_node_write_idx(b);
-}
-
-static inline struct btree_write *btree_prev_write(struct btree *b)
-{
-	return b->writes + (btree_node_write_idx(b) ^ 1);
-}
-
-static inline struct bset_tree *bset_tree_last(struct btree *b)
-{
-	EBUG_ON(!b->nsets);
-	return b->set + b->nsets - 1;
-}
-
-static inline void *
-__btree_node_offset_to_ptr(const struct btree *b, u16 offset)
-{
-	return (void *) ((u64 *) b->data + 1 + offset);
-}
-
-static inline u16
-__btree_node_ptr_to_offset(const struct btree *b, const void *p)
-{
-	u16 ret = (u64 *) p - 1 - (u64 *) b->data;
-
-	EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p);
-	return ret;
-}
-
-static inline struct bset *bset(const struct btree *b,
-				const struct bset_tree *t)
-{
-	return __btree_node_offset_to_ptr(b, t->data_offset);
-}
-
-static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
-{
-	t->end_offset =
-		__btree_node_ptr_to_offset(b, vstruct_last(bset(b, t)));
-}
-
-static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
-				  const struct bset *i)
-{
-	t->data_offset = __btree_node_ptr_to_offset(b, i);
-	set_btree_bset_end(b, t);
-}
-
-static inline struct bset *btree_bset_first(struct btree *b)
-{
-	return bset(b, b->set);
-}
-
-static inline struct bset *btree_bset_last(struct btree *b)
-{
-	return bset(b, bset_tree_last(b));
-}
-
-static inline u16
-__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
-{
-	return __btree_node_ptr_to_offset(b, k);
-}
-
-static inline struct bkey_packed *
-__btree_node_offset_to_key(const struct btree *b, u16 k)
-{
-	return __btree_node_offset_to_ptr(b, k);
-}
-
-static inline unsigned btree_bkey_first_offset(const struct bset_tree *t)
-{
-	return t->data_offset + offsetof(struct bset, _data) / sizeof(u64);
-}
-
-#define btree_bkey_first(_b, _t)					\
-({									\
-	EBUG_ON(bset(_b, _t)->start !=					\
-		__btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\
-									\
-	bset(_b, _t)->start;						\
-})
-
-#define btree_bkey_last(_b, _t)						\
-({									\
-	EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) !=	\
-		vstruct_last(bset(_b, _t)));				\
-									\
-	__btree_node_offset_to_key(_b, (_t)->end_offset);		\
-})
-
-static inline unsigned bset_u64s(struct bset_tree *t)
-{
-	return t->end_offset - t->data_offset -
-		sizeof(struct bset) / sizeof(u64);
-}
-
-static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t)
-{
-	return bset_u64s(t) - b->nr.bset_u64s[t - b->set];
-}
-
-static inline unsigned bset_byte_offset(struct btree *b, void *i)
-{
-	return i - (void *) b->data;
-}
-
-enum btree_node_type {
-	BKEY_TYPE_btree,
-#define x(kwd, val, ...) BKEY_TYPE_##kwd = val + 1,
-	BCH_BTREE_IDS()
-#undef x
-	BKEY_TYPE_NR
-};
-
-/* Type of a key in btree @id at level @level: */
-static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id)
-{
-	return level ? BKEY_TYPE_btree : (unsigned) id + 1;
-}
-
-/* Type of keys @b contains: */
-static inline enum btree_node_type btree_node_type(struct btree *b)
-{
-	return __btree_node_type(b->c.level, b->c.btree_id);
-}
-
-const char *bch2_btree_node_type_str(enum btree_node_type);
-
-#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS		\
-	(BIT_ULL(BKEY_TYPE_extents)|			\
-	 BIT_ULL(BKEY_TYPE_alloc)|			\
-	 BIT_ULL(BKEY_TYPE_inodes)|			\
-	 BIT_ULL(BKEY_TYPE_stripes)|			\
-	 BIT_ULL(BKEY_TYPE_reflink)|			\
-	 BIT_ULL(BKEY_TYPE_subvolumes)|			\
-	 BIT_ULL(BKEY_TYPE_btree))
-
-#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS		\
-	(BIT_ULL(BKEY_TYPE_alloc)|			\
-	 BIT_ULL(BKEY_TYPE_inodes)|			\
-	 BIT_ULL(BKEY_TYPE_stripes)|			\
-	 BIT_ULL(BKEY_TYPE_snapshots))
-
-#define BTREE_NODE_TYPE_HAS_TRIGGERS			\
-	(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS|		\
-	 BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS)
-
-static inline bool btree_node_type_needs_gc(enum btree_node_type type)
-{
-	return BTREE_NODE_TYPE_HAS_TRIGGERS & BIT_ULL(type);
-}
-
-static inline bool btree_node_type_is_extents(enum btree_node_type type)
-{
-	const unsigned mask = 0
-#define x(name, nr, flags, ...)	|((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1))
-	BCH_BTREE_IDS()
-#undef x
-	;
-
-	return (1U << type) & mask;
-}
-
-static inline bool btree_id_is_extents(enum btree_id btree)
-{
-	return btree_node_type_is_extents(__btree_node_type(0, btree));
-}
-
-static inline bool btree_type_has_snapshots(enum btree_id id)
-{
-	const unsigned mask = 0
-#define x(name, nr, flags, ...)	|((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr)
-	BCH_BTREE_IDS()
-#undef x
-	;
-
-	return (1U << id) & mask;
-}
-
-static inline bool btree_type_has_snapshot_field(enum btree_id id)
-{
-	const unsigned mask = 0
-#define x(name, nr, flags, ...)	|((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr)
-	BCH_BTREE_IDS()
-#undef x
-	;
-
-	return (1U << id) & mask;
-}
-
-static inline bool btree_type_has_ptrs(enum btree_id id)
-{
-	const unsigned mask = 0
-#define x(name, nr, flags, ...)	|((!!((flags) & BTREE_ID_DATA)) << nr)
-	BCH_BTREE_IDS()
-#undef x
-	;
-
-	return (1U << id) & mask;
-}
-
-struct btree_root {
-	struct btree		*b;
-
-	/* On disk root - see async splits: */
-	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
-	u8			level;
-	u8			alive;
-	s16			error;
-};
-
-enum btree_gc_coalesce_fail_reason {
-	BTREE_GC_COALESCE_FAIL_RESERVE_GET,
-	BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
-	BTREE_GC_COALESCE_FAIL_FORMAT_FITS,
-};
-
-enum btree_node_sibling {
-	btree_prev_sib,
-	btree_next_sib,
-};
-
-struct get_locks_fail {
-	unsigned	l;
-	struct btree	*b;
-};
-
-#endif /* _BCACHEFS_BTREE_TYPES_H */
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
deleted file mode 100644
index f3c645a43dcb..000000000000
--- a/fs/bcachefs/btree_update.c
+++ /dev/null
@@ -1,897 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_update.h"
-#include "btree_iter.h"
-#include "btree_journal_iter.h"
-#include "btree_locking.h"
-#include "buckets.h"
-#include "debug.h"
-#include "errcode.h"
-#include "error.h"
-#include "extents.h"
-#include "keylist.h"
-#include "snapshot.h"
-#include "trace.h"
-
-static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
-					 const struct btree_insert_entry *r)
-{
-	return   cmp_int(l->btree_id,	r->btree_id) ?:
-		 cmp_int(l->cached,	r->cached) ?:
-		 -cmp_int(l->level,	r->level) ?:
-		 bpos_cmp(l->k->k.p,	r->k->k.p);
-}
-
-static int __must_check
-bch2_trans_update_by_path(struct btree_trans *, btree_path_idx_t,
-			  struct bkey_i *, enum btree_iter_update_trigger_flags,
-			  unsigned long ip);
-
-static noinline int extent_front_merge(struct btree_trans *trans,
-				       struct btree_iter *iter,
-				       struct bkey_s_c k,
-				       struct bkey_i **insert,
-				       enum btree_iter_update_trigger_flags flags)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_i *update;
-	int ret;
-
-	if (unlikely(trans->journal_replay_not_finished))
-		return 0;
-
-	update = bch2_bkey_make_mut_noupdate(trans, k);
-	ret = PTR_ERR_OR_ZERO(update);
-	if (ret)
-		return ret;
-
-	if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert)))
-		return 0;
-
-	ret =   bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p) ?:
-		bch2_key_has_snapshot_overwrites(trans, iter->btree_id, (*insert)->k.p);
-	if (ret < 0)
-		return ret;
-	if (ret)
-		return 0;
-
-	ret = bch2_btree_delete_at(trans, iter, flags);
-	if (ret)
-		return ret;
-
-	*insert = update;
-	return 0;
-}
-
-static noinline int extent_back_merge(struct btree_trans *trans,
-				      struct btree_iter *iter,
-				      struct bkey_i *insert,
-				      struct bkey_s_c k)
-{
-	struct bch_fs *c = trans->c;
-	int ret;
-
-	if (unlikely(trans->journal_replay_not_finished))
-		return 0;
-
-	ret =   bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?:
-		bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p);
-	if (ret < 0)
-		return ret;
-	if (ret)
-		return 0;
-
-	bch2_bkey_merge(c, bkey_i_to_s(insert), k);
-	return 0;
-}
-
-/*
- * When deleting, check if we need to emit a whiteout (because we're overwriting
- * something in an ancestor snapshot)
- */
-static int need_whiteout_for_snapshot(struct btree_trans *trans,
-				      enum btree_id btree_id, struct bpos pos)
-{
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	u32 snapshot = pos.snapshot;
-	int ret;
-
-	if (!bch2_snapshot_parent(trans->c, pos.snapshot))
-		return 0;
-
-	pos.snapshot++;
-
-	for_each_btree_key_norestart(trans, iter, btree_id, pos,
-			   BTREE_ITER_all_snapshots|
-			   BTREE_ITER_nopreserve, k, ret) {
-		if (!bkey_eq(k.k->p, pos))
-			break;
-
-		if (bch2_snapshot_is_ancestor(trans->c, snapshot,
-					      k.k->p.snapshot)) {
-			ret = !bkey_whiteout(k.k);
-			break;
-		}
-	}
-	bch2_trans_iter_exit(trans, &iter);
-
-	return ret;
-}
-
-int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
-				   enum btree_id id,
-				   struct bpos old_pos,
-				   struct bpos new_pos)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter old_iter, new_iter = { NULL };
-	struct bkey_s_c old_k, new_k;
-	snapshot_id_list s;
-	struct bkey_i *update;
-	int ret = 0;
-
-	if (!bch2_snapshot_has_children(c, old_pos.snapshot))
-		return 0;
-
-	darray_init(&s);
-
-	bch2_trans_iter_init(trans, &old_iter, id, old_pos,
-			     BTREE_ITER_not_extents|
-			     BTREE_ITER_all_snapshots);
-	while ((old_k = bch2_btree_iter_prev(&old_iter)).k &&
-	       !(ret = bkey_err(old_k)) &&
-	       bkey_eq(old_pos, old_k.k->p)) {
-		struct bpos whiteout_pos =
-			SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);;
-
-		if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
-		    snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))
-			continue;
-
-		new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos,
-					   BTREE_ITER_not_extents|
-					   BTREE_ITER_intent);
-		ret = bkey_err(new_k);
-		if (ret)
-			break;
-
-		if (new_k.k->type == KEY_TYPE_deleted) {
-			update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
-			ret = PTR_ERR_OR_ZERO(update);
-			if (ret)
-				break;
-
-			bkey_init(&update->k);
-			update->k.p		= whiteout_pos;
-			update->k.type		= KEY_TYPE_whiteout;
-
-			ret = bch2_trans_update(trans, &new_iter, update,
-						BTREE_UPDATE_internal_snapshot_node);
-		}
-		bch2_trans_iter_exit(trans, &new_iter);
-
-		ret = snapshot_list_add(c, &s, old_k.k->p.snapshot);
-		if (ret)
-			break;
-	}
-	bch2_trans_iter_exit(trans, &new_iter);
-	bch2_trans_iter_exit(trans, &old_iter);
-	darray_exit(&s);
-
-	return ret;
-}
-
-int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
-				       struct btree_iter *iter,
-				       enum btree_iter_update_trigger_flags flags,
-				       struct bkey_s_c old,
-				       struct bkey_s_c new)
-{
-	enum btree_id btree_id = iter->btree_id;
-	struct bkey_i *update;
-	struct bpos new_start = bkey_start_pos(new.k);
-	unsigned front_split = bkey_lt(bkey_start_pos(old.k), new_start);
-	unsigned back_split  = bkey_gt(old.k->p, new.k->p);
-	unsigned middle_split = (front_split || back_split) &&
-		old.k->p.snapshot != new.k->p.snapshot;
-	unsigned nr_splits = front_split + back_split + middle_split;
-	int ret = 0, compressed_sectors;
-
-	/*
-	 * If we're going to be splitting a compressed extent, note it
-	 * so that __bch2_trans_commit() can increase our disk
-	 * reservation:
-	 */
-	if (nr_splits > 1 &&
-	    (compressed_sectors = bch2_bkey_sectors_compressed(old)))
-		trans->extra_disk_res += compressed_sectors * (nr_splits - 1);
-
-	if (front_split) {
-		update = bch2_bkey_make_mut_noupdate(trans, old);
-		if ((ret = PTR_ERR_OR_ZERO(update)))
-			return ret;
-
-		bch2_cut_back(new_start, update);
-
-		ret =   bch2_insert_snapshot_whiteouts(trans, btree_id,
-					old.k->p, update->k.p) ?:
-			bch2_btree_insert_nonextent(trans, btree_id, update,
-					BTREE_UPDATE_internal_snapshot_node|flags);
-		if (ret)
-			return ret;
-	}
-
-	/* If we're overwriting in a different snapshot - middle split: */
-	if (middle_split) {
-		update = bch2_bkey_make_mut_noupdate(trans, old);
-		if ((ret = PTR_ERR_OR_ZERO(update)))
-			return ret;
-
-		bch2_cut_front(new_start, update);
-		bch2_cut_back(new.k->p, update);
-
-		ret =   bch2_insert_snapshot_whiteouts(trans, btree_id,
-					old.k->p, update->k.p) ?:
-			bch2_btree_insert_nonextent(trans, btree_id, update,
-					  BTREE_UPDATE_internal_snapshot_node|flags);
-		if (ret)
-			return ret;
-	}
-
-	if (bkey_le(old.k->p, new.k->p)) {
-		update = bch2_trans_kmalloc(trans, sizeof(*update));
-		if ((ret = PTR_ERR_OR_ZERO(update)))
-			return ret;
-
-		bkey_init(&update->k);
-		update->k.p = old.k->p;
-		update->k.p.snapshot = new.k->p.snapshot;
-
-		if (new.k->p.snapshot != old.k->p.snapshot) {
-			update->k.type = KEY_TYPE_whiteout;
-		} else if (btree_type_has_snapshots(btree_id)) {
-			ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p);
-			if (ret < 0)
-				return ret;
-			if (ret)
-				update->k.type = KEY_TYPE_whiteout;
-		}
-
-		ret = bch2_btree_insert_nonextent(trans, btree_id, update,
-					  BTREE_UPDATE_internal_snapshot_node|flags);
-		if (ret)
-			return ret;
-	}
-
-	if (back_split) {
-		update = bch2_bkey_make_mut_noupdate(trans, old);
-		if ((ret = PTR_ERR_OR_ZERO(update)))
-			return ret;
-
-		bch2_cut_front(new.k->p, update);
-
-		ret = bch2_trans_update_by_path(trans, iter->path, update,
-					  BTREE_UPDATE_internal_snapshot_node|
-					  flags, _RET_IP_);
-		if (ret)
-			return ret;
-	}
-
-	return 0;
-}
-
-static int bch2_trans_update_extent(struct btree_trans *trans,
-				    struct btree_iter *orig_iter,
-				    struct bkey_i *insert,
-				    enum btree_iter_update_trigger_flags flags)
-{
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	enum btree_id btree_id = orig_iter->btree_id;
-	int ret = 0;
-
-	bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k),
-			     BTREE_ITER_intent|
-			     BTREE_ITER_with_updates|
-			     BTREE_ITER_not_extents);
-	k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
-	if ((ret = bkey_err(k)))
-		goto err;
-	if (!k.k)
-		goto out;
-
-	if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) {
-		if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
-			ret = extent_front_merge(trans, &iter, k, &insert, flags);
-			if (ret)
-				goto err;
-		}
-
-		goto next;
-	}
-
-	while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) {
-		bool done = bkey_lt(insert->k.p, k.k->p);
-
-		ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert));
-		if (ret)
-			goto err;
-
-		if (done)
-			goto out;
-next:
-		bch2_btree_iter_advance(&iter);
-		k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
-		if ((ret = bkey_err(k)))
-			goto err;
-		if (!k.k)
-			goto out;
-	}
-
-	if (bch2_bkey_maybe_mergable(&insert->k, k.k)) {
-		ret = extent_back_merge(trans, &iter, insert, k);
-		if (ret)
-			goto err;
-	}
-out:
-	if (!bkey_deleted(&insert->k))
-		ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags);
-err:
-	bch2_trans_iter_exit(trans, &iter);
-
-	return ret;
-}
-
-static noinline int flush_new_cached_update(struct btree_trans *trans,
-					    struct btree_insert_entry *i,
-					    enum btree_iter_update_trigger_flags flags,
-					    unsigned long ip)
-{
-	struct bkey k;
-	int ret;
-
-	btree_path_idx_t path_idx =
-		bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0,
-			      BTREE_ITER_intent, _THIS_IP_);
-	ret = bch2_btree_path_traverse(trans, path_idx, 0);
-	if (ret)
-		goto out;
-
-	struct btree_path *btree_path = trans->paths + path_idx;
-
-	/*
-	 * The old key in the insert entry might actually refer to an existing
-	 * key in the btree that has been deleted from cache and not yet
-	 * flushed. Check for this and skip the flush so we don't run triggers
-	 * against a stale key.
-	 */
-	bch2_btree_path_peek_slot_exact(btree_path, &k);
-	if (!bkey_deleted(&k))
-		goto out;
-
-	i->key_cache_already_flushed = true;
-	i->flags |= BTREE_TRIGGER_norun;
-
-	btree_path_set_should_be_locked(btree_path);
-	ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip);
-out:
-	bch2_path_put(trans, path_idx, true);
-	return ret;
-}
-
-static int __must_check
-bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
-			  struct bkey_i *k, enum btree_iter_update_trigger_flags flags,
-			  unsigned long ip)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_insert_entry *i, n;
-	int cmp;
-
-	struct btree_path *path = trans->paths + path_idx;
-	EBUG_ON(!path->should_be_locked);
-	EBUG_ON(trans->nr_updates >= trans->nr_paths);
-	EBUG_ON(!bpos_eq(k->k.p, path->pos));
-
-	n = (struct btree_insert_entry) {
-		.flags		= flags,
-		.bkey_type	= __btree_node_type(path->level, path->btree_id),
-		.btree_id	= path->btree_id,
-		.level		= path->level,
-		.cached		= path->cached,
-		.path		= path_idx,
-		.k		= k,
-		.ip_allocated	= ip,
-	};
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-	trans_for_each_update(trans, i)
-		BUG_ON(i != trans->updates &&
-		       btree_insert_entry_cmp(i - 1, i) >= 0);
-#endif
-
-	/*
-	 * Pending updates are kept sorted: first, find position of new update,
-	 * then delete/trim any updates the new update overwrites:
-	 */
-	for (i = trans->updates; i < trans->updates + trans->nr_updates; i++) {
-		cmp = btree_insert_entry_cmp(&n, i);
-		if (cmp <= 0)
-			break;
-	}
-
-	if (!cmp && i < trans->updates + trans->nr_updates) {
-		EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
-
-		bch2_path_put(trans, i->path, true);
-		i->flags	= n.flags;
-		i->cached	= n.cached;
-		i->k		= n.k;
-		i->path		= n.path;
-		i->ip_allocated	= n.ip_allocated;
-	} else {
-		array_insert_item(trans->updates, trans->nr_updates,
-				  i - trans->updates, n);
-
-		i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v;
-		i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0;
-
-		if (unlikely(trans->journal_replay_not_finished)) {
-			struct bkey_i *j_k =
-				bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p);
-
-			if (j_k) {
-				i->old_k = j_k->k;
-				i->old_v = &j_k->v;
-			}
-		}
-	}
-
-	__btree_path_get(trans->paths + i->path, true);
-
-	/*
-	 * If a key is present in the key cache, it must also exist in the
-	 * btree - this is necessary for cache coherency. When iterating over
-	 * a btree that's cached in the key cache, the btree iter code checks
-	 * the key cache - but the key has to exist in the btree for that to
-	 * work:
-	 */
-	if (path->cached && !i->old_btree_u64s)
-		return flush_new_cached_update(trans, i, flags, ip);
-
-	return 0;
-}
-
-static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
-						    struct btree_iter *iter,
-						    struct btree_path *path)
-{
-	struct btree_path *key_cache_path = btree_iter_key_cache_path(trans, iter);
-
-	if (!key_cache_path ||
-	    !key_cache_path->should_be_locked ||
-	    !bpos_eq(key_cache_path->pos, iter->pos)) {
-		struct bkey_cached *ck;
-		int ret;
-
-		if (!iter->key_cache_path)
-			iter->key_cache_path =
-				bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
-					      BTREE_ITER_intent|
-					      BTREE_ITER_cached, _THIS_IP_);
-
-		iter->key_cache_path =
-			bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
-						iter->flags & BTREE_ITER_intent,
-						_THIS_IP_);
-
-		ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_cached);
-		if (unlikely(ret))
-			return ret;
-
-		ck = (void *) trans->paths[iter->key_cache_path].l[0].b;
-
-		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-			trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
-			return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
-		}
-
-		btree_path_set_should_be_locked(trans->paths + iter->key_cache_path);
-	}
-
-	return 0;
-}
-
-int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
-				   struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
-{
-	btree_path_idx_t path_idx = iter->update_path ?: iter->path;
-	int ret;
-
-	if (iter->flags & BTREE_ITER_is_extents)
-		return bch2_trans_update_extent(trans, iter, k, flags);
-
-	if (bkey_deleted(&k->k) &&
-	    !(flags & BTREE_UPDATE_key_cache_reclaim) &&
-	    (iter->flags & BTREE_ITER_filter_snapshots)) {
-		ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
-		if (unlikely(ret < 0))
-			return ret;
-
-		if (ret)
-			k->k.type = KEY_TYPE_whiteout;
-	}
-
-	/*
-	 * Ensure that updates to cached btrees go to the key cache:
-	 */
-	struct btree_path *path = trans->paths + path_idx;
-	if (!(flags & BTREE_UPDATE_key_cache_reclaim) &&
-	    !path->cached &&
-	    !path->level &&
-	    btree_id_cached(trans->c, path->btree_id)) {
-		ret = bch2_trans_update_get_key_cache(trans, iter, path);
-		if (ret)
-			return ret;
-
-		path_idx = iter->key_cache_path;
-	}
-
-	return bch2_trans_update_by_path(trans, path_idx, k, flags, _RET_IP_);
-}
-
-int bch2_btree_insert_clone_trans(struct btree_trans *trans,
-				  enum btree_id btree,
-				  struct bkey_i *k)
-{
-	struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(&k->k));
-	int ret = PTR_ERR_OR_ZERO(n);
-	if (ret)
-		return ret;
-
-	bkey_copy(n, k);
-	return bch2_btree_insert_trans(trans, btree, n, 0);
-}
-
-struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
-{
-	unsigned new_top = trans->journal_entries_u64s + u64s;
-	unsigned old_size = trans->journal_entries_size;
-
-	if (new_top > trans->journal_entries_size) {
-		trans->journal_entries_size = roundup_pow_of_two(new_top);
-
-		btree_trans_stats(trans)->journal_entries_size = trans->journal_entries_size;
-	}
-
-	struct jset_entry *n =
-		bch2_trans_kmalloc_nomemzero(trans,
-				trans->journal_entries_size * sizeof(u64));
-	if (IS_ERR(n))
-		return ERR_CAST(n);
-
-	if (trans->journal_entries)
-		memcpy(n, trans->journal_entries, old_size * sizeof(u64));
-	trans->journal_entries = n;
-
-	struct jset_entry *e = btree_trans_journal_entries_top(trans);
-	trans->journal_entries_u64s = new_top;
-	return e;
-}
-
-int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
-			     enum btree_id btree, struct bpos end)
-{
-	struct bkey_s_c k;
-	int ret = 0;
-
-	bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_intent);
-	k = bch2_btree_iter_prev(iter);
-	ret = bkey_err(k);
-	if (ret)
-		goto err;
-
-	bch2_btree_iter_advance(iter);
-	k = bch2_btree_iter_peek_slot(iter);
-	ret = bkey_err(k);
-	if (ret)
-		goto err;
-
-	BUG_ON(k.k->type != KEY_TYPE_deleted);
-
-	if (bkey_gt(k.k->p, end)) {
-		ret = -BCH_ERR_ENOSPC_btree_slot;
-		goto err;
-	}
-
-	return 0;
-err:
-	bch2_trans_iter_exit(trans, iter);
-	return ret;
-}
-
-void bch2_trans_commit_hook(struct btree_trans *trans,
-			    struct btree_trans_commit_hook *h)
-{
-	h->next = trans->hooks;
-	trans->hooks = h;
-}
-
-int bch2_btree_insert_nonextent(struct btree_trans *trans,
-				enum btree_id btree, struct bkey_i *k,
-				enum btree_iter_update_trigger_flags flags)
-{
-	struct btree_iter iter;
-	int ret;
-
-	bch2_trans_iter_init(trans, &iter, btree, k->k.p,
-			     BTREE_ITER_cached|
-			     BTREE_ITER_not_extents|
-			     BTREE_ITER_intent);
-	ret   = bch2_btree_iter_traverse(&iter) ?:
-		bch2_trans_update(trans, &iter, k, flags);
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id,
-			    struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
-{
-	struct btree_iter iter;
-	bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
-			     BTREE_ITER_intent|flags);
-	int ret = bch2_btree_iter_traverse(&iter) ?:
-		  bch2_trans_update(trans, &iter, k, flags);
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-/**
- * bch2_btree_insert - insert keys into the extent btree
- * @c:			pointer to struct bch_fs
- * @id:			btree to insert into
- * @k:			key to insert
- * @disk_res:		must be non-NULL whenever inserting or potentially
- *			splitting data extents
- * @flags:		transaction commit flags
- *
- * Returns:		0 on success, error code on failure
- */
-int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k,
-		      struct disk_reservation *disk_res, int flags)
-{
-	return bch2_trans_do(c, disk_res, NULL, flags,
-			     bch2_btree_insert_trans(trans, id, k, 0));
-}
-
-int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter,
-				unsigned len, unsigned update_flags)
-{
-	struct bkey_i *k;
-
-	k = bch2_trans_kmalloc(trans, sizeof(*k));
-	if (IS_ERR(k))
-		return PTR_ERR(k);
-
-	bkey_init(&k->k);
-	k->k.p = iter->pos;
-	bch2_key_resize(&k->k, len);
-	return bch2_trans_update(trans, iter, k, update_flags);
-}
-
-int bch2_btree_delete_at(struct btree_trans *trans,
-			 struct btree_iter *iter, unsigned update_flags)
-{
-	return bch2_btree_delete_extent_at(trans, iter, 0, update_flags);
-}
-
-int bch2_btree_delete(struct btree_trans *trans,
-		      enum btree_id btree, struct bpos pos,
-		      unsigned update_flags)
-{
-	struct btree_iter iter;
-	int ret;
-
-	bch2_trans_iter_init(trans, &iter, btree, pos,
-			     BTREE_ITER_cached|
-			     BTREE_ITER_intent);
-	ret   = bch2_btree_iter_traverse(&iter) ?:
-		bch2_btree_delete_at(trans, &iter, update_flags);
-	bch2_trans_iter_exit(trans, &iter);
-
-	return ret;
-}
-
-int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
-				  struct bpos start, struct bpos end,
-				  unsigned update_flags,
-				  u64 *journal_seq)
-{
-	u32 restart_count = trans->restart_count;
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	int ret = 0;
-
-	bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent);
-	while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) {
-		struct disk_reservation disk_res =
-			bch2_disk_reservation_init(trans->c, 0);
-		struct bkey_i delete;
-
-		ret = bkey_err(k);
-		if (ret)
-			goto err;
-
-		bkey_init(&delete.k);
-
-		/*
-		 * This could probably be more efficient for extents:
-		 */
-
-		/*
-		 * For extents, iter.pos won't necessarily be the same as
-		 * bkey_start_pos(k.k) (for non extents they always will be the
-		 * same). It's important that we delete starting from iter.pos
-		 * because the range we want to delete could start in the middle
-		 * of k.
-		 *
-		 * (bch2_btree_iter_peek() does guarantee that iter.pos >=
-		 * bkey_start_pos(k.k)).
-		 */
-		delete.k.p = iter.pos;
-
-		if (iter.flags & BTREE_ITER_is_extents)
-			bch2_key_resize(&delete.k,
-					bpos_min(end, k.k->p).offset -
-					iter.pos.offset);
-
-		ret   = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
-			bch2_trans_commit(trans, &disk_res, journal_seq,
-					  BCH_TRANS_COMMIT_no_enospc);
-		bch2_disk_reservation_put(trans->c, &disk_res);
-err:
-		/*
-		 * the bch2_trans_begin() call is in a weird place because we
-		 * need to call it after every transaction commit, to avoid path
-		 * overflow, but don't want to call it if the delete operation
-		 * is a no-op and we have no work to do:
-		 */
-		bch2_trans_begin(trans);
-
-		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			ret = 0;
-		if (ret)
-			break;
-	}
-	bch2_trans_iter_exit(trans, &iter);
-
-	return ret ?: trans_was_restarted(trans, restart_count);
-}
-
-/*
- * bch_btree_delete_range - delete everything within a given range
- *
- * Range is a half open interval - [start, end)
- */
-int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
-			    struct bpos start, struct bpos end,
-			    unsigned update_flags,
-			    u64 *journal_seq)
-{
-	int ret = bch2_trans_run(c,
-			bch2_btree_delete_range_trans(trans, id, start, end,
-						      update_flags, journal_seq));
-	if (ret == -BCH_ERR_transaction_restart_nested)
-		ret = 0;
-	return ret;
-}
-
-int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
-		       struct bpos pos, bool set)
-{
-	struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
-	int ret = PTR_ERR_OR_ZERO(k);
-	if (ret)
-		return ret;
-
-	bkey_init(&k->k);
-	k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
-	k->k.p = pos;
-
-	struct btree_iter iter;
-	bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent);
-
-	ret   = bch2_btree_iter_traverse(&iter) ?:
-		bch2_trans_update(trans, &iter, k, 0);
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree,
-				struct bpos pos, bool set)
-{
-	struct bkey_i k;
-
-	bkey_init(&k.k);
-	k.k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
-	k.k.p = pos;
-
-	return bch2_trans_update_buffered(trans, btree, &k);
-}
-
-static int __bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf, unsigned u64s)
-{
-	struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s));
-	int ret = PTR_ERR_OR_ZERO(e);
-	if (ret)
-		return ret;
-
-	struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry);
-	journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s);
-	memcpy(l->d, buf->buf, buf->pos);
-	return 0;
-}
-
-__printf(3, 0)
-static int
-__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
-		  va_list args)
-{
-	struct printbuf buf = PRINTBUF;
-	prt_vprintf(&buf, fmt, args);
-
-	unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
-	prt_chars(&buf, '\0', u64s * sizeof(u64) - buf.pos);
-
-	int ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
-	if (ret)
-		goto err;
-
-	if (!test_bit(JOURNAL_running, &c->journal.flags)) {
-		ret = darray_make_room(&c->journal.early_journal_entries, jset_u64s(u64s));
-		if (ret)
-			goto err;
-
-		struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries);
-		journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s);
-		memcpy(l->d, buf.buf, buf.pos);
-		c->journal.early_journal_entries.nr += jset_u64s(u64s);
-	} else {
-		ret = bch2_trans_do(c, NULL, NULL,
-			BCH_TRANS_COMMIT_lazy_rw|commit_flags,
-			__bch2_trans_log_msg(trans, &buf, u64s));
-	}
-err:
-	printbuf_exit(&buf);
-	return ret;
-}
-
-__printf(2, 3)
-int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...)
-{
-	va_list args;
-	int ret;
-
-	va_start(args, fmt);
-	ret = __bch2_fs_log_msg(c, 0, fmt, args);
-	va_end(args);
-	return ret;
-}
-
-/*
- * Use for logging messages during recovery to enable reserved space and avoid
- * blocking.
- */
-__printf(2, 3)
-int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...)
-{
-	va_list args;
-	int ret;
-
-	va_start(args, fmt);
-	ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args);
-	va_end(args);
-	return ret;
-}
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
deleted file mode 100644
index b4894e4d5447..000000000000
--- a/fs/bcachefs/btree_update.h
+++ /dev/null
@@ -1,364 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_UPDATE_H
-#define _BCACHEFS_BTREE_UPDATE_H
-
-#include "btree_iter.h"
-#include "journal.h"
-
-struct bch_fs;
-struct btree;
-
-void bch2_btree_node_prep_for_write(struct btree_trans *,
-				    struct btree_path *, struct btree *);
-bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *,
-				struct btree *, struct btree_node_iter *,
-				struct bkey_i *);
-
-int bch2_btree_node_flush0(struct journal *, struct journal_entry_pin *, u64);
-int bch2_btree_node_flush1(struct journal *, struct journal_entry_pin *, u64);
-void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
-
-void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
-				struct bkey_i *, u64);
-
-#define BCH_TRANS_COMMIT_FLAGS()							\
-	x(no_enospc,	"don't check for enospc")					\
-	x(no_check_rw,	"don't attempt to take a ref on c->writes")			\
-	x(lazy_rw,	"go read-write if we haven't yet - only for use in recovery")	\
-	x(no_journal_res, "don't take a journal reservation, instead "			\
-			"pin journal entry referred to by trans->journal_res.seq")	\
-	x(journal_reclaim, "operation required for journal reclaim; may return error"	\
-			"instead of deadlocking if BCH_WATERMARK_reclaim not specified")\
-
-enum __bch_trans_commit_flags {
-	/* First bits for bch_watermark: */
-	__BCH_TRANS_COMMIT_FLAGS_START = BCH_WATERMARK_BITS,
-#define x(n, ...)	__BCH_TRANS_COMMIT_##n,
-	BCH_TRANS_COMMIT_FLAGS()
-#undef x
-};
-
-enum bch_trans_commit_flags {
-#define x(n, ...)	BCH_TRANS_COMMIT_##n = BIT(__BCH_TRANS_COMMIT_##n),
-	BCH_TRANS_COMMIT_FLAGS()
-#undef x
-};
-
-void bch2_trans_commit_flags_to_text(struct printbuf *, enum bch_trans_commit_flags);
-
-int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *,
-				unsigned, unsigned);
-int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
-int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned);
-
-int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id,
-				struct bkey_i *, enum btree_iter_update_trigger_flags);
-
-int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *,
-			enum btree_iter_update_trigger_flags);
-int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
-		     struct disk_reservation *, int flags);
-
-int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
-				  struct bpos, struct bpos, unsigned, u64 *);
-int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
-			    struct bpos, struct bpos, unsigned, u64 *);
-
-int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool);
-int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool);
-
-static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans,
-						enum btree_id btree, struct bpos pos)
-{
-	return bch2_btree_bit_mod_buffered(trans, btree, pos, false);
-}
-
-int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id,
-				     struct bpos, struct bpos);
-
-/*
- * For use when splitting extents in existing snapshots:
- *
- * If @old_pos is an interior snapshot node, iterate over descendent snapshot
- * nodes: for every descendent snapshot in whiche @old_pos is overwritten and
- * not visible, emit a whiteout at @new_pos.
- */
-static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
-						 enum btree_id btree,
-						 struct bpos old_pos,
-						 struct bpos new_pos)
-{
-	if (!btree_type_has_snapshots(btree) ||
-	    bkey_eq(old_pos, new_pos))
-		return 0;
-
-	return __bch2_insert_snapshot_whiteouts(trans, btree, old_pos, new_pos);
-}
-
-int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *,
-				       enum btree_iter_update_trigger_flags,
-				       struct bkey_s_c, struct bkey_s_c);
-
-int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *,
-			     enum btree_id, struct bpos);
-
-int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
-				   struct bkey_i *, enum btree_iter_update_trigger_flags);
-
-struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned);
-
-static inline struct jset_entry *btree_trans_journal_entries_top(struct btree_trans *trans)
-{
-	return (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
-}
-
-static inline struct jset_entry *
-bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
-{
-	if (!trans->journal_entries ||
-	    trans->journal_entries_u64s + u64s > trans->journal_entries_size)
-		return __bch2_trans_jset_entry_alloc(trans, u64s);
-
-	struct jset_entry *e = btree_trans_journal_entries_top(trans);
-	trans->journal_entries_u64s += u64s;
-	return e;
-}
-
-int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *);
-
-static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
-					    enum btree_id btree,
-					    struct bkey_i *k)
-{
-	if (unlikely(trans->journal_replay_not_finished))
-		return bch2_btree_insert_clone_trans(trans, btree, k);
-
-	struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s));
-	int ret = PTR_ERR_OR_ZERO(e);
-	if (ret)
-		return ret;
-
-	journal_entry_init(e, BCH_JSET_ENTRY_write_buffer_keys, btree, 0, k->k.u64s);
-	bkey_copy(e->start, k);
-	return 0;
-}
-
-void bch2_trans_commit_hook(struct btree_trans *,
-			    struct btree_trans_commit_hook *);
-int __bch2_trans_commit(struct btree_trans *, unsigned);
-
-__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
-__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...);
-
-/**
- * bch2_trans_commit - insert keys at given iterator positions
- *
- * This is main entry point for btree updates.
- *
- * Return values:
- * -EROFS: filesystem read only
- * -EIO: journal or btree node IO error
- */
-static inline int bch2_trans_commit(struct btree_trans *trans,
-				    struct disk_reservation *disk_res,
-				    u64 *journal_seq,
-				    unsigned flags)
-{
-	trans->disk_res		= disk_res;
-	trans->journal_seq	= journal_seq;
-
-	return __bch2_trans_commit(trans, flags);
-}
-
-#define commit_do(_trans, _disk_res, _journal_seq, _flags, _do)	\
-	lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
-					(_journal_seq), (_flags)))
-
-#define nested_commit_do(_trans, _disk_res, _journal_seq, _flags, _do)	\
-	nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
-					(_journal_seq), (_flags)))
-
-#define bch2_trans_run(_c, _do)						\
-({									\
-	struct btree_trans *trans = bch2_trans_get(_c);			\
-	int _ret = (_do);						\
-	bch2_trans_put(trans);						\
-	_ret;								\
-})
-
-#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do)		\
-	bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do))
-
-#define trans_for_each_update(_trans, _i)				\
-	for (struct btree_insert_entry *_i = (_trans)->updates;		\
-	     (_i) < (_trans)->updates + (_trans)->nr_updates;		\
-	     (_i)++)
-
-static inline void bch2_trans_reset_updates(struct btree_trans *trans)
-{
-	trans_for_each_update(trans, i)
-		bch2_path_put(trans, i->path, true);
-
-	trans->nr_updates		= 0;
-	trans->journal_entries_u64s	= 0;
-	trans->hooks			= NULL;
-	trans->extra_disk_res		= 0;
-
-	if (trans->fs_usage_deltas) {
-		trans->fs_usage_deltas->used = 0;
-		memset((void *) trans->fs_usage_deltas +
-		       offsetof(struct replicas_delta_list, memset_start), 0,
-		       (void *) &trans->fs_usage_deltas->memset_end -
-		       (void *) &trans->fs_usage_deltas->memset_start);
-	}
-}
-
-static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k,
-						  unsigned type, unsigned min_bytes)
-{
-	unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k));
-	struct bkey_i *mut;
-
-	if (type && k.k->type != type)
-		return ERR_PTR(-ENOENT);
-
-	mut = bch2_trans_kmalloc_nomemzero(trans, bytes);
-	if (!IS_ERR(mut)) {
-		bkey_reassemble(mut, k);
-
-		if (unlikely(bytes > bkey_bytes(k.k))) {
-			memset((void *) mut + bkey_bytes(k.k), 0,
-			       bytes - bkey_bytes(k.k));
-			mut->k.u64s = DIV_ROUND_UP(bytes, sizeof(u64));
-		}
-	}
-	return mut;
-}
-
-static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k)
-{
-	return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0);
-}
-
-#define bch2_bkey_make_mut_noupdate_typed(_trans, _k, _type)		\
-	bkey_i_to_##_type(__bch2_bkey_make_mut_noupdate(_trans, _k,	\
-				KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
-
-static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter,
-					struct bkey_s_c *k, unsigned flags,
-					unsigned type, unsigned min_bytes)
-{
-	struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes);
-	int ret;
-
-	if (IS_ERR(mut))
-		return mut;
-
-	ret = bch2_trans_update(trans, iter, mut, flags);
-	if (ret)
-		return ERR_PTR(ret);
-
-	*k = bkey_i_to_s_c(mut);
-	return mut;
-}
-
-static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter,
-						struct bkey_s_c *k, unsigned flags)
-{
-	return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0);
-}
-
-#define bch2_bkey_make_mut_typed(_trans, _iter, _k, _flags, _type)	\
-	bkey_i_to_##_type(__bch2_bkey_make_mut(_trans, _iter, _k, _flags,\
-				KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
-
-static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans,
-					 struct btree_iter *iter,
-					 unsigned btree_id, struct bpos pos,
-					 unsigned flags, unsigned type, unsigned min_bytes)
-{
-	struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter,
-				btree_id, pos, flags|BTREE_ITER_intent, type);
-	struct bkey_i *ret = IS_ERR(k.k)
-		? ERR_CAST(k.k)
-		: __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes);
-	if (IS_ERR(ret))
-		bch2_trans_iter_exit(trans, iter);
-	return ret;
-}
-
-static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans,
-					       struct btree_iter *iter,
-					       unsigned btree_id, struct bpos pos,
-					       unsigned flags)
-{
-	return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0);
-}
-
-static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans,
-					 struct btree_iter *iter,
-					 unsigned btree_id, struct bpos pos,
-					 unsigned flags, unsigned type, unsigned min_bytes)
-{
-	struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter,
-				btree_id, pos, flags|BTREE_ITER_intent, type, min_bytes);
-	int ret;
-
-	if (IS_ERR(mut))
-		return mut;
-
-	ret = bch2_trans_update(trans, iter, mut, flags);
-	if (ret) {
-		bch2_trans_iter_exit(trans, iter);
-		return ERR_PTR(ret);
-	}
-
-	return mut;
-}
-
-static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans,
-						       struct btree_iter *iter,
-						       unsigned btree_id, struct bpos pos,
-						       unsigned flags, unsigned min_bytes)
-{
-	return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes);
-}
-
-static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans,
-					       struct btree_iter *iter,
-					       unsigned btree_id, struct bpos pos,
-					       unsigned flags)
-{
-	return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0);
-}
-
-#define bch2_bkey_get_mut_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\
-	bkey_i_to_##_type(__bch2_bkey_get_mut(_trans, _iter,		\
-			_btree_id, _pos, _flags,			\
-			KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
-
-static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter,
-					       unsigned flags, unsigned type, unsigned val_size)
-{
-	struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size);
-	int ret;
-
-	if (IS_ERR(k))
-		return k;
-
-	bkey_init(&k->k);
-	k->k.p = iter->pos;
-	k->k.type = type;
-	set_bkey_val_bytes(&k->k, val_size);
-
-	ret = bch2_trans_update(trans, iter, k, flags);
-	if (unlikely(ret))
-		return ERR_PTR(ret);
-	return k;
-}
-
-#define bch2_bkey_alloc(_trans, _iter, _flags, _type)			\
-	bkey_i_to_##_type(__bch2_bkey_alloc(_trans, _iter, _flags,	\
-				KEY_TYPE_##_type, sizeof(struct bch_##_type)))
-
-#endif /* _BCACHEFS_BTREE_UPDATE_H */
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
deleted file mode 100644
index 60b8544cea48..000000000000
--- a/fs/bcachefs/btree_update_interior.c
+++ /dev/null
@@ -1,2688 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-#include "btree_gc.h"
-#include "btree_journal_iter.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_locking.h"
-#include "buckets.h"
-#include "clock.h"
-#include "error.h"
-#include "extents.h"
-#include "journal.h"
-#include "journal_reclaim.h"
-#include "keylist.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "sb-members.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/random.h>
-
-static const char * const bch2_btree_update_modes[] = {
-#define x(t) #t,
-	BTREE_UPDATE_MODES()
-#undef x
-	NULL
-};
-
-static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
-				  btree_path_idx_t, struct btree *, struct keylist *);
-static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
-
-/*
- * Verify that child nodes correctly span parent node's range:
- */
-int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
-{
-	struct bch_fs *c = trans->c;
-	struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2
-		? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key
-		: b->data->min_key;
-	struct btree_and_journal_iter iter;
-	struct bkey_s_c k;
-	struct printbuf buf = PRINTBUF;
-	struct bkey_buf prev;
-	int ret = 0;
-
-	BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
-	       !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
-			b->data->min_key));
-
-	if (b == btree_node_root(c, b)) {
-		if (!bpos_eq(b->data->min_key, POS_MIN)) {
-			printbuf_reset(&buf);
-			bch2_bpos_to_text(&buf, b->data->min_key);
-			need_fsck_err(c, btree_root_bad_min_key,
-				      "btree root with incorrect min_key: %s", buf.buf);
-			goto topology_repair;
-		}
-
-		if (!bpos_eq(b->data->max_key, SPOS_MAX)) {
-			printbuf_reset(&buf);
-			bch2_bpos_to_text(&buf, b->data->max_key);
-			need_fsck_err(c, btree_root_bad_max_key,
-				      "btree root with incorrect max_key: %s", buf.buf);
-			goto topology_repair;
-		}
-	}
-
-	if (!b->c.level)
-		return 0;
-
-	bch2_bkey_buf_init(&prev);
-	bkey_init(&prev.k->k);
-	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
-
-	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
-		if (k.k->type != KEY_TYPE_btree_ptr_v2)
-			goto out;
-
-		struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
-
-		struct bpos expected_min = bkey_deleted(&prev.k->k)
-			? node_min
-			: bpos_successor(prev.k->k.p);
-
-		if (!bpos_eq(expected_min, bp.v->min_key)) {
-			bch2_topology_error(c);
-
-			printbuf_reset(&buf);
-			prt_str(&buf, "end of prev node doesn't match start of next node\n"),
-			prt_printf(&buf, "  in btree %s level %u node ",
-				   bch2_btree_id_str(b->c.btree_id), b->c.level);
-			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-			prt_str(&buf, "\n  prev ");
-			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
-			prt_str(&buf, "\n  next ");
-			bch2_bkey_val_to_text(&buf, c, k);
-
-			need_fsck_err(c, btree_node_topology_bad_min_key, "%s", buf.buf);
-			goto topology_repair;
-		}
-
-		bch2_bkey_buf_reassemble(&prev, c, k);
-		bch2_btree_and_journal_iter_advance(&iter);
-	}
-
-	if (bkey_deleted(&prev.k->k)) {
-		bch2_topology_error(c);
-
-		printbuf_reset(&buf);
-		prt_str(&buf, "empty interior node\n");
-		prt_printf(&buf, "  in btree %s level %u node ",
-			   bch2_btree_id_str(b->c.btree_id), b->c.level);
-		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
-		need_fsck_err(c, btree_node_topology_empty_interior_node, "%s", buf.buf);
-		goto topology_repair;
-	} else if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
-		bch2_topology_error(c);
-
-		printbuf_reset(&buf);
-		prt_str(&buf, "last child node doesn't end at end of parent node\n");
-		prt_printf(&buf, "  in btree %s level %u node ",
-			   bch2_btree_id_str(b->c.btree_id), b->c.level);
-		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-		prt_str(&buf, "\n  last key ");
-		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
-
-		need_fsck_err(c, btree_node_topology_bad_max_key, "%s", buf.buf);
-		goto topology_repair;
-	}
-out:
-fsck_err:
-	bch2_btree_and_journal_iter_exit(&iter);
-	bch2_bkey_buf_exit(&prev, c);
-	printbuf_exit(&buf);
-	return ret;
-topology_repair:
-	if ((c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) &&
-	    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) {
-		bch2_inconsistent_error(c);
-		ret = -BCH_ERR_btree_need_topology_repair;
-	} else {
-		ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
-	}
-	goto out;
-}
-
-/* Calculate ideal packed bkey format for new btree nodes: */
-
-static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
-{
-	struct bkey_packed *k;
-	struct bkey uk;
-
-	for_each_bset(b, t)
-		bset_tree_for_each_key(b, t, k)
-			if (!bkey_deleted(k)) {
-				uk = bkey_unpack_key(b, k);
-				bch2_bkey_format_add_key(s, &uk);
-			}
-}
-
-static struct bkey_format bch2_btree_calc_format(struct btree *b)
-{
-	struct bkey_format_state s;
-
-	bch2_bkey_format_init(&s);
-	bch2_bkey_format_add_pos(&s, b->data->min_key);
-	bch2_bkey_format_add_pos(&s, b->data->max_key);
-	__bch2_btree_calc_format(&s, b);
-
-	return bch2_bkey_format_done(&s);
-}
-
-static size_t btree_node_u64s_with_format(struct btree_nr_keys nr,
-					  struct bkey_format *old_f,
-					  struct bkey_format *new_f)
-{
-	/* stupid integer promotion rules */
-	ssize_t delta =
-	    (((int) new_f->key_u64s - old_f->key_u64s) *
-	     (int) nr.packed_keys) +
-	    (((int) new_f->key_u64s - BKEY_U64s) *
-	     (int) nr.unpacked_keys);
-
-	BUG_ON(delta + nr.live_u64s < 0);
-
-	return nr.live_u64s + delta;
-}
-
-/**
- * bch2_btree_node_format_fits - check if we could rewrite node with a new format
- *
- * @c:		filesystem handle
- * @b:		btree node to rewrite
- * @nr:		number of keys for new node (i.e. b->nr)
- * @new_f:	bkey format to translate keys to
- *
- * Returns: true if all re-packed keys will be able to fit in a new node.
- *
- * Assumes all keys will successfully pack with the new format.
- */
-static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
-				 struct btree_nr_keys nr,
-				 struct bkey_format *new_f)
-{
-	size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f);
-
-	return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b);
-}
-
-/* Btree node freeing/allocation: */
-
-static void __btree_node_free(struct btree_trans *trans, struct btree *b)
-{
-	struct bch_fs *c = trans->c;
-
-	trace_and_count(c, btree_node_free, trans, b);
-
-	BUG_ON(btree_node_write_blocked(b));
-	BUG_ON(btree_node_dirty(b));
-	BUG_ON(btree_node_need_write(b));
-	BUG_ON(b == btree_node_root(c, b));
-	BUG_ON(b->ob.nr);
-	BUG_ON(!list_empty(&b->write_blocked));
-	BUG_ON(b->will_make_reachable);
-
-	clear_btree_node_noevict(b);
-
-	mutex_lock(&c->btree_cache.lock);
-	list_move(&b->list, &c->btree_cache.freeable);
-	mutex_unlock(&c->btree_cache.lock);
-}
-
-static void bch2_btree_node_free_inmem(struct btree_trans *trans,
-				       struct btree_path *path,
-				       struct btree *b)
-{
-	struct bch_fs *c = trans->c;
-	unsigned i, level = b->c.level;
-
-	bch2_btree_node_lock_write_nofail(trans, path, &b->c);
-	bch2_btree_node_hash_remove(&c->btree_cache, b);
-	__btree_node_free(trans, b);
-	six_unlock_write(&b->c.lock);
-	mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
-
-	trans_for_each_path(trans, path, i)
-		if (path->l[level].b == b) {
-			btree_node_unlock(trans, path, level);
-			path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
-		}
-}
-
-static void bch2_btree_node_free_never_used(struct btree_update *as,
-					    struct btree_trans *trans,
-					    struct btree *b)
-{
-	struct bch_fs *c = as->c;
-	struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];
-	struct btree_path *path;
-	unsigned i, level = b->c.level;
-
-	BUG_ON(!list_empty(&b->write_blocked));
-	BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));
-
-	b->will_make_reachable = 0;
-	closure_put(&as->cl);
-
-	clear_btree_node_will_make_reachable(b);
-	clear_btree_node_accessed(b);
-	clear_btree_node_dirty_acct(c, b);
-	clear_btree_node_need_write(b);
-
-	mutex_lock(&c->btree_cache.lock);
-	list_del_init(&b->list);
-	bch2_btree_node_hash_remove(&c->btree_cache, b);
-	mutex_unlock(&c->btree_cache.lock);
-
-	BUG_ON(p->nr >= ARRAY_SIZE(p->b));
-	p->b[p->nr++] = b;
-
-	six_unlock_intent(&b->c.lock);
-
-	trans_for_each_path(trans, path, i)
-		if (path->l[level].b == b) {
-			btree_node_unlock(trans, path, level);
-			path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
-		}
-}
-
-static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
-					     struct disk_reservation *res,
-					     struct closure *cl,
-					     bool interior_node,
-					     unsigned flags)
-{
-	struct bch_fs *c = trans->c;
-	struct write_point *wp;
-	struct btree *b;
-	BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
-	struct open_buckets obs = { .nr = 0 };
-	struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
-	enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
-	unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim
-		? BTREE_NODE_RESERVE
-		: 0;
-	int ret;
-
-	mutex_lock(&c->btree_reserve_cache_lock);
-	if (c->btree_reserve_cache_nr > nr_reserve) {
-		struct btree_alloc *a =
-			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];
-
-		obs = a->ob;
-		bkey_copy(&tmp.k, &a->k);
-		mutex_unlock(&c->btree_reserve_cache_lock);
-		goto mem_alloc;
-	}
-	mutex_unlock(&c->btree_reserve_cache_lock);
-
-retry:
-	ret = bch2_alloc_sectors_start_trans(trans,
-				      c->opts.metadata_target ?:
-				      c->opts.foreground_target,
-				      0,
-				      writepoint_ptr(&c->btree_write_point),
-				      &devs_have,
-				      res->nr_replicas,
-				      min(res->nr_replicas,
-					  c->opts.metadata_replicas_required),
-				      watermark, 0, cl, &wp);
-	if (unlikely(ret))
-		return ERR_PTR(ret);
-
-	if (wp->sectors_free < btree_sectors(c)) {
-		struct open_bucket *ob;
-		unsigned i;
-
-		open_bucket_for_each(c, &wp->ptrs, ob, i)
-			if (ob->sectors_free < btree_sectors(c))
-				ob->sectors_free = 0;
-
-		bch2_alloc_sectors_done(c, wp);
-		goto retry;
-	}
-
-	bkey_btree_ptr_v2_init(&tmp.k);
-	bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false);
-
-	bch2_open_bucket_get(c, wp, &obs);
-	bch2_alloc_sectors_done(c, wp);
-mem_alloc:
-	b = bch2_btree_node_mem_alloc(trans, interior_node);
-	six_unlock_write(&b->c.lock);
-	six_unlock_intent(&b->c.lock);
-
-	/* we hold cannibalize_lock: */
-	BUG_ON(IS_ERR(b));
-	BUG_ON(b->ob.nr);
-
-	bkey_copy(&b->key, &tmp.k);
-	b->ob = obs;
-
-	return b;
-}
-
-static struct btree *bch2_btree_node_alloc(struct btree_update *as,
-					   struct btree_trans *trans,
-					   unsigned level)
-{
-	struct bch_fs *c = as->c;
-	struct btree *b;
-	struct prealloc_nodes *p = &as->prealloc_nodes[!!level];
-	int ret;
-
-	BUG_ON(level >= BTREE_MAX_DEPTH);
-	BUG_ON(!p->nr);
-
-	b = p->b[--p->nr];
-
-	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
-	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
-
-	set_btree_node_accessed(b);
-	set_btree_node_dirty_acct(c, b);
-	set_btree_node_need_write(b);
-
-	bch2_bset_init_first(b, &b->data->keys);
-	b->c.level	= level;
-	b->c.btree_id	= as->btree_id;
-	b->version_ondisk = c->sb.version;
-
-	memset(&b->nr, 0, sizeof(b->nr));
-	b->data->magic = cpu_to_le64(bset_magic(c));
-	memset(&b->data->_ptr, 0, sizeof(b->data->_ptr));
-	b->data->flags = 0;
-	SET_BTREE_NODE_ID(b->data, as->btree_id);
-	SET_BTREE_NODE_LEVEL(b->data, level);
-
-	if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
-		struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);
-
-		bp->v.mem_ptr		= 0;
-		bp->v.seq		= b->data->keys.seq;
-		bp->v.sectors_written	= 0;
-	}
-
-	SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
-
-	bch2_btree_build_aux_trees(b);
-
-	ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
-	BUG_ON(ret);
-
-	trace_and_count(c, btree_node_alloc, trans, b);
-	bch2_increment_clock(c, btree_sectors(c), WRITE);
-	return b;
-}
-
-static void btree_set_min(struct btree *b, struct bpos pos)
-{
-	if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
-		bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos;
-	b->data->min_key = pos;
-}
-
-static void btree_set_max(struct btree *b, struct bpos pos)
-{
-	b->key.k.p = pos;
-	b->data->max_key = pos;
-}
-
-static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
-						       struct btree_trans *trans,
-						       struct btree *b)
-{
-	struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level);
-	struct bkey_format format = bch2_btree_calc_format(b);
-
-	/*
-	 * The keys might expand with the new format - if they wouldn't fit in
-	 * the btree node anymore, use the old format for now:
-	 */
-	if (!bch2_btree_node_format_fits(as->c, b, b->nr, &format))
-		format = b->format;
-
-	SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
-
-	btree_set_min(n, b->data->min_key);
-	btree_set_max(n, b->data->max_key);
-
-	n->data->format		= format;
-	btree_node_set_format(n, format);
-
-	bch2_btree_sort_into(as->c, n, b);
-
-	btree_node_reset_sib_u64s(n);
-	return n;
-}
-
-static struct btree *__btree_root_alloc(struct btree_update *as,
-				struct btree_trans *trans, unsigned level)
-{
-	struct btree *b = bch2_btree_node_alloc(as, trans, level);
-
-	btree_set_min(b, POS_MIN);
-	btree_set_max(b, SPOS_MAX);
-	b->data->format = bch2_btree_calc_format(b);
-
-	btree_node_set_format(b, b->data->format);
-	bch2_btree_build_aux_trees(b);
-
-	return b;
-}
-
-static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans)
-{
-	struct bch_fs *c = as->c;
-	struct prealloc_nodes *p;
-
-	for (p = as->prealloc_nodes;
-	     p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
-	     p++) {
-		while (p->nr) {
-			struct btree *b = p->b[--p->nr];
-
-			mutex_lock(&c->btree_reserve_cache_lock);
-
-			if (c->btree_reserve_cache_nr <
-			    ARRAY_SIZE(c->btree_reserve_cache)) {
-				struct btree_alloc *a =
-					&c->btree_reserve_cache[c->btree_reserve_cache_nr++];
-
-				a->ob = b->ob;
-				b->ob.nr = 0;
-				bkey_copy(&a->k, &b->key);
-			} else {
-				bch2_open_buckets_put(c, &b->ob);
-			}
-
-			mutex_unlock(&c->btree_reserve_cache_lock);
-
-			btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
-			btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
-			__btree_node_free(trans, b);
-			six_unlock_write(&b->c.lock);
-			six_unlock_intent(&b->c.lock);
-		}
-	}
-}
-
-static int bch2_btree_reserve_get(struct btree_trans *trans,
-				  struct btree_update *as,
-				  unsigned nr_nodes[2],
-				  unsigned flags,
-				  struct closure *cl)
-{
-	struct btree *b;
-	unsigned interior;
-	int ret = 0;
-
-	BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX);
-
-	/*
-	 * Protects reaping from the btree node cache and using the btree node
-	 * open bucket reserve:
-	 */
-	ret = bch2_btree_cache_cannibalize_lock(trans, cl);
-	if (ret)
-		return ret;
-
-	for (interior = 0; interior < 2; interior++) {
-		struct prealloc_nodes *p = as->prealloc_nodes + interior;
-
-		while (p->nr < nr_nodes[interior]) {
-			b = __bch2_btree_node_alloc(trans, &as->disk_res, cl,
-						    interior, flags);
-			if (IS_ERR(b)) {
-				ret = PTR_ERR(b);
-				goto err;
-			}
-
-			p->b[p->nr++] = b;
-		}
-	}
-err:
-	bch2_btree_cache_cannibalize_unlock(trans);
-	return ret;
-}
-
-/* Asynchronous interior node update machinery */
-
-static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans)
-{
-	struct bch_fs *c = as->c;
-
-	if (as->took_gc_lock)
-		up_read(&c->gc_lock);
-	as->took_gc_lock = false;
-
-	bch2_journal_pin_drop(&c->journal, &as->journal);
-	bch2_journal_pin_flush(&c->journal, &as->journal);
-	bch2_disk_reservation_put(c, &as->disk_res);
-	bch2_btree_reserve_put(as, trans);
-
-	bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
-			       as->start_time);
-
-	mutex_lock(&c->btree_interior_update_lock);
-	list_del(&as->unwritten_list);
-	list_del(&as->list);
-
-	closure_debug_destroy(&as->cl);
-	mempool_free(as, &c->btree_interior_update_pool);
-
-	/*
-	 * Have to do the wakeup with btree_interior_update_lock still held,
-	 * since being on btree_interior_update_list is our ref on @c:
-	 */
-	closure_wake_up(&c->btree_interior_update_wait);
-
-	mutex_unlock(&c->btree_interior_update_lock);
-}
-
-static void btree_update_add_key(struct btree_update *as,
-				 struct keylist *keys, struct btree *b)
-{
-	struct bkey_i *k = &b->key;
-
-	BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s >
-	       ARRAY_SIZE(as->_old_keys));
-
-	bkey_copy(keys->top, k);
-	bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1;
-
-	bch2_keylist_push(keys);
-}
-
-static bool btree_update_new_nodes_marked_sb(struct btree_update *as)
-{
-	for_each_keylist_key(&as->new_keys, k)
-		if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k)))
-			return false;
-	return true;
-}
-
-static void btree_update_new_nodes_mark_sb(struct btree_update *as)
-{
-	struct bch_fs *c = as->c;
-
-	mutex_lock(&c->sb_lock);
-	for_each_keylist_key(&as->new_keys, k)
-		bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(k));
-
-	bch2_write_super(c);
-	mutex_unlock(&c->sb_lock);
-}
-
-/*
- * The transactional part of an interior btree node update, where we journal the
- * update we did to the interior node and update alloc info:
- */
-static int btree_update_nodes_written_trans(struct btree_trans *trans,
-					    struct btree_update *as)
-{
-	struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, as->journal_u64s);
-	int ret = PTR_ERR_OR_ZERO(e);
-	if (ret)
-		return ret;
-
-	memcpy(e, as->journal_entries, as->journal_u64s * sizeof(u64));
-
-	trans->journal_pin = &as->journal;
-
-	for_each_keylist_key(&as->old_keys, k) {
-		unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
-
-		ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k),
-					   BTREE_TRIGGER_transactional);
-		if (ret)
-			return ret;
-	}
-
-	for_each_keylist_key(&as->new_keys, k) {
-		unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
-
-		ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k),
-					   BTREE_TRIGGER_transactional);
-		if (ret)
-			return ret;
-	}
-
-	return 0;
-}
-
-static void btree_update_nodes_written(struct btree_update *as)
-{
-	struct bch_fs *c = as->c;
-	struct btree *b;
-	struct btree_trans *trans = bch2_trans_get(c);
-	u64 journal_seq = 0;
-	unsigned i;
-	int ret;
-
-	/*
-	 * If we're already in an error state, it might be because a btree node
-	 * was never written, and we might be trying to free that same btree
-	 * node here, but it won't have been marked as allocated and we'll see
-	 * spurious disk usage inconsistencies in the transactional part below
-	 * if we don't skip it:
-	 */
-	ret = bch2_journal_error(&c->journal);
-	if (ret)
-		goto err;
-
-	if (!btree_update_new_nodes_marked_sb(as))
-		btree_update_new_nodes_mark_sb(as);
-
-	/*
-	 * Wait for any in flight writes to finish before we free the old nodes
-	 * on disk:
-	 */
-	for (i = 0; i < as->nr_old_nodes; i++) {
-		__le64 seq;
-
-		b = as->old_nodes[i];
-
-		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
-		seq = b->data ? b->data->keys.seq : 0;
-		six_unlock_read(&b->c.lock);
-
-		if (seq == as->old_nodes_seq[i])
-			wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
-				       TASK_UNINTERRUPTIBLE);
-	}
-
-	/*
-	 * We did an update to a parent node where the pointers we added pointed
-	 * to child nodes that weren't written yet: now, the child nodes have
-	 * been written so we can write out the update to the interior node.
-	 */
-
-	/*
-	 * We can't call into journal reclaim here: we'd block on the journal
-	 * reclaim lock, but we may need to release the open buckets we have
-	 * pinned in order for other btree updates to make forward progress, and
-	 * journal reclaim does btree updates when flushing bkey_cached entries,
-	 * which may require allocations as well.
-	 */
-	ret = commit_do(trans, &as->disk_res, &journal_seq,
-			BCH_WATERMARK_interior_updates|
-			BCH_TRANS_COMMIT_no_enospc|
-			BCH_TRANS_COMMIT_no_check_rw|
-			BCH_TRANS_COMMIT_journal_reclaim,
-			btree_update_nodes_written_trans(trans, as));
-	bch2_trans_unlock(trans);
-
-	bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
-			     "%s", bch2_err_str(ret));
-err:
-	/*
-	 * We have to be careful because another thread might be getting ready
-	 * to free as->b and calling btree_update_reparent() on us - we'll
-	 * recheck under btree_update_lock below:
-	 */
-	b = READ_ONCE(as->b);
-	if (b) {
-		/*
-		 * @b is the node we did the final insert into:
-		 *
-		 * On failure to get a journal reservation, we still have to
-		 * unblock the write and allow most of the write path to happen
-		 * so that shutdown works, but the i->journal_seq mechanism
-		 * won't work to prevent the btree write from being visible (we
-		 * didn't get a journal sequence number) - instead
-		 * __bch2_btree_node_write() doesn't do the actual write if
-		 * we're in journal error state:
-		 */
-
-		/*
-		 * Ensure transaction is unlocked before using
-		 * btree_node_lock_nopath() (the use of which is always suspect,
-		 * we need to work on removing this in the future)
-		 *
-		 * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get()
-		 * calls bch2_path_upgrade(), before we call path_make_mut(), so
-		 * we may rarely end up with a locked path besides the one we
-		 * have here:
-		 */
-		bch2_trans_unlock(trans);
-		bch2_trans_begin(trans);
-		btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans,
-						as->btree_id, b->c.level, b->key.k.p);
-		struct btree_path *path = trans->paths + path_idx;
-		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
-		mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED);
-		path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
-		path->l[b->c.level].b = b;
-
-		bch2_btree_node_lock_write_nofail(trans, path, &b->c);
-
-		mutex_lock(&c->btree_interior_update_lock);
-
-		list_del(&as->write_blocked_list);
-		if (list_empty(&b->write_blocked))
-			clear_btree_node_write_blocked(b);
-
-		/*
-		 * Node might have been freed, recheck under
-		 * btree_interior_update_lock:
-		 */
-		if (as->b == b) {
-			BUG_ON(!b->c.level);
-			BUG_ON(!btree_node_dirty(b));
-
-			if (!ret) {
-				struct bset *last = btree_bset_last(b);
-
-				last->journal_seq = cpu_to_le64(
-							     max(journal_seq,
-								 le64_to_cpu(last->journal_seq)));
-
-				bch2_btree_add_journal_pin(c, b, journal_seq);
-			} else {
-				/*
-				 * If we didn't get a journal sequence number we
-				 * can't write this btree node, because recovery
-				 * won't know to ignore this write:
-				 */
-				set_btree_node_never_write(b);
-			}
-		}
-
-		mutex_unlock(&c->btree_interior_update_lock);
-
-		mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
-		six_unlock_write(&b->c.lock);
-
-		btree_node_write_if_need(c, b, SIX_LOCK_intent);
-		btree_node_unlock(trans, path, b->c.level);
-		bch2_path_put(trans, path_idx, true);
-	}
-
-	bch2_journal_pin_drop(&c->journal, &as->journal);
-
-	mutex_lock(&c->btree_interior_update_lock);
-	for (i = 0; i < as->nr_new_nodes; i++) {
-		b = as->new_nodes[i];
-
-		BUG_ON(b->will_make_reachable != (unsigned long) as);
-		b->will_make_reachable = 0;
-		clear_btree_node_will_make_reachable(b);
-	}
-	mutex_unlock(&c->btree_interior_update_lock);
-
-	for (i = 0; i < as->nr_new_nodes; i++) {
-		b = as->new_nodes[i];
-
-		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
-		btree_node_write_if_need(c, b, SIX_LOCK_read);
-		six_unlock_read(&b->c.lock);
-	}
-
-	for (i = 0; i < as->nr_open_buckets; i++)
-		bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
-
-	bch2_btree_update_free(as, trans);
-	bch2_trans_put(trans);
-}
-
-static void btree_interior_update_work(struct work_struct *work)
-{
-	struct bch_fs *c =
-		container_of(work, struct bch_fs, btree_interior_update_work);
-	struct btree_update *as;
-
-	while (1) {
-		mutex_lock(&c->btree_interior_update_lock);
-		as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
-					      struct btree_update, unwritten_list);
-		if (as && !as->nodes_written)
-			as = NULL;
-		mutex_unlock(&c->btree_interior_update_lock);
-
-		if (!as)
-			break;
-
-		btree_update_nodes_written(as);
-	}
-}
-
-static CLOSURE_CALLBACK(btree_update_set_nodes_written)
-{
-	closure_type(as, struct btree_update, cl);
-	struct bch_fs *c = as->c;
-
-	mutex_lock(&c->btree_interior_update_lock);
-	as->nodes_written = true;
-	mutex_unlock(&c->btree_interior_update_lock);
-
-	queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
-}
-
-/*
- * We're updating @b with pointers to nodes that haven't finished writing yet:
- * block @b from being written until @as completes
- */
-static void btree_update_updated_node(struct btree_update *as, struct btree *b)
-{
-	struct bch_fs *c = as->c;
-
-	BUG_ON(as->mode != BTREE_UPDATE_none);
-	BUG_ON(as->update_level_end < b->c.level);
-	BUG_ON(!btree_node_dirty(b));
-	BUG_ON(!b->c.level);
-
-	mutex_lock(&c->btree_interior_update_lock);
-	list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
-
-	as->mode	= BTREE_UPDATE_node;
-	as->b		= b;
-	as->update_level_end = b->c.level;
-
-	set_btree_node_write_blocked(b);
-	list_add(&as->write_blocked_list, &b->write_blocked);
-
-	mutex_unlock(&c->btree_interior_update_lock);
-}
-
-static int bch2_update_reparent_journal_pin_flush(struct journal *j,
-				struct journal_entry_pin *_pin, u64 seq)
-{
-	return 0;
-}
-
-static void btree_update_reparent(struct btree_update *as,
-				  struct btree_update *child)
-{
-	struct bch_fs *c = as->c;
-
-	lockdep_assert_held(&c->btree_interior_update_lock);
-
-	child->b = NULL;
-	child->mode = BTREE_UPDATE_update;
-
-	bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
-			      bch2_update_reparent_journal_pin_flush);
-}
-
-static void btree_update_updated_root(struct btree_update *as, struct btree *b)
-{
-	struct bkey_i *insert = &b->key;
-	struct bch_fs *c = as->c;
-
-	BUG_ON(as->mode != BTREE_UPDATE_none);
-
-	BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
-	       ARRAY_SIZE(as->journal_entries));
-
-	as->journal_u64s +=
-		journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
-				  BCH_JSET_ENTRY_btree_root,
-				  b->c.btree_id, b->c.level,
-				  insert, insert->k.u64s);
-
-	mutex_lock(&c->btree_interior_update_lock);
-	list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
-
-	as->mode	= BTREE_UPDATE_root;
-	mutex_unlock(&c->btree_interior_update_lock);
-}
-
-/*
- * bch2_btree_update_add_new_node:
- *
- * This causes @as to wait on @b to be written, before it gets to
- * bch2_btree_update_nodes_written
- *
- * Additionally, it sets b->will_make_reachable to prevent any additional writes
- * to @b from happening besides the first until @b is reachable on disk
- *
- * And it adds @b to the list of @as's new nodes, so that we can update sector
- * counts in bch2_btree_update_nodes_written:
- */
-static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
-{
-	struct bch_fs *c = as->c;
-
-	closure_get(&as->cl);
-
-	mutex_lock(&c->btree_interior_update_lock);
-	BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
-	BUG_ON(b->will_make_reachable);
-
-	as->new_nodes[as->nr_new_nodes++] = b;
-	b->will_make_reachable = 1UL|(unsigned long) as;
-	set_btree_node_will_make_reachable(b);
-
-	mutex_unlock(&c->btree_interior_update_lock);
-
-	btree_update_add_key(as, &as->new_keys, b);
-
-	if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
-		unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data;
-		unsigned sectors = round_up(bytes, block_bytes(c)) >> 9;
-
-		bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
-			cpu_to_le16(sectors);
-	}
-}
-
-/*
- * returns true if @b was a new node
- */
-static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
-{
-	struct btree_update *as;
-	unsigned long v;
-	unsigned i;
-
-	mutex_lock(&c->btree_interior_update_lock);
-	/*
-	 * When b->will_make_reachable != 0, it owns a ref on as->cl that's
-	 * dropped when it gets written by bch2_btree_complete_write - the
-	 * xchg() is for synchronization with bch2_btree_complete_write:
-	 */
-	v = xchg(&b->will_make_reachable, 0);
-	clear_btree_node_will_make_reachable(b);
-	as = (struct btree_update *) (v & ~1UL);
-
-	if (!as) {
-		mutex_unlock(&c->btree_interior_update_lock);
-		return;
-	}
-
-	for (i = 0; i < as->nr_new_nodes; i++)
-		if (as->new_nodes[i] == b)
-			goto found;
-
-	BUG();
-found:
-	array_remove_item(as->new_nodes, as->nr_new_nodes, i);
-	mutex_unlock(&c->btree_interior_update_lock);
-
-	if (v & 1)
-		closure_put(&as->cl);
-}
-
-static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
-{
-	while (b->ob.nr)
-		as->open_buckets[as->nr_open_buckets++] =
-			b->ob.v[--b->ob.nr];
-}
-
-static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j,
-				struct journal_entry_pin *_pin, u64 seq)
-{
-	return 0;
-}
-
-/*
- * @b is being split/rewritten: it may have pointers to not-yet-written btree
- * nodes and thus outstanding btree_updates - redirect @b's
- * btree_updates to point to this btree_update:
- */
-static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
-						      struct btree *b)
-{
-	struct bch_fs *c = as->c;
-	struct btree_update *p, *n;
-	struct btree_write *w;
-
-	set_btree_node_dying(b);
-
-	if (btree_node_fake(b))
-		return;
-
-	mutex_lock(&c->btree_interior_update_lock);
-
-	/*
-	 * Does this node have any btree_update operations preventing
-	 * it from being written?
-	 *
-	 * If so, redirect them to point to this btree_update: we can
-	 * write out our new nodes, but we won't make them visible until those
-	 * operations complete
-	 */
-	list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
-		list_del_init(&p->write_blocked_list);
-		btree_update_reparent(as, p);
-
-		/*
-		 * for flush_held_btree_writes() waiting on updates to flush or
-		 * nodes to be writeable:
-		 */
-		closure_wake_up(&c->btree_interior_update_wait);
-	}
-
-	clear_btree_node_dirty_acct(c, b);
-	clear_btree_node_need_write(b);
-	clear_btree_node_write_blocked(b);
-
-	/*
-	 * Does this node have unwritten data that has a pin on the journal?
-	 *
-	 * If so, transfer that pin to the btree_update operation -
-	 * note that if we're freeing multiple nodes, we only need to keep the
-	 * oldest pin of any of the nodes we're freeing. We'll release the pin
-	 * when the new nodes are persistent and reachable on disk:
-	 */
-	w = btree_current_write(b);
-	bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
-			      bch2_btree_update_will_free_node_journal_pin_flush);
-	bch2_journal_pin_drop(&c->journal, &w->journal);
-
-	w = btree_prev_write(b);
-	bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
-			      bch2_btree_update_will_free_node_journal_pin_flush);
-	bch2_journal_pin_drop(&c->journal, &w->journal);
-
-	mutex_unlock(&c->btree_interior_update_lock);
-
-	/*
-	 * Is this a node that isn't reachable on disk yet?
-	 *
-	 * Nodes that aren't reachable yet have writes blocked until they're
-	 * reachable - now that we've cancelled any pending writes and moved
-	 * things waiting on that write to wait on this update, we can drop this
-	 * node from the list of nodes that the other update is making
-	 * reachable, prior to freeing it:
-	 */
-	btree_update_drop_new_node(c, b);
-
-	btree_update_add_key(as, &as->old_keys, b);
-
-	as->old_nodes[as->nr_old_nodes] = b;
-	as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
-	as->nr_old_nodes++;
-}
-
-static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans)
-{
-	struct bch_fs *c = as->c;
-	u64 start_time = as->start_time;
-
-	BUG_ON(as->mode == BTREE_UPDATE_none);
-
-	if (as->took_gc_lock)
-		up_read(&as->c->gc_lock);
-	as->took_gc_lock = false;
-
-	bch2_btree_reserve_put(as, trans);
-
-	continue_at(&as->cl, btree_update_set_nodes_written,
-		    as->c->btree_interior_update_worker);
-
-	bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
-			       start_time);
-}
-
-static struct btree_update *
-bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
-			unsigned level_start, bool split, unsigned flags)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_update *as;
-	u64 start_time = local_clock();
-	int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
-		? BCH_DISK_RESERVATION_NOFAIL : 0;
-	unsigned nr_nodes[2] = { 0, 0 };
-	unsigned level_end = level_start;
-	enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
-	int ret = 0;
-	u32 restart_count = trans->restart_count;
-
-	BUG_ON(!path->should_be_locked);
-
-	if (watermark == BCH_WATERMARK_copygc)
-		watermark = BCH_WATERMARK_btree_copygc;
-	if (watermark < BCH_WATERMARK_btree)
-		watermark = BCH_WATERMARK_btree;
-
-	flags &= ~BCH_WATERMARK_MASK;
-	flags |= watermark;
-
-	if (watermark < BCH_WATERMARK_reclaim &&
-	    test_bit(JOURNAL_space_low, &c->journal.flags)) {
-		if (flags & BCH_TRANS_COMMIT_journal_reclaim)
-			return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock);
-
-		ret = drop_locks_do(trans,
-			({ wait_event(c->journal.wait, !test_bit(JOURNAL_space_low, &c->journal.flags)); 0; }));
-		if (ret)
-			return ERR_PTR(ret);
-	}
-
-	while (1) {
-		nr_nodes[!!level_end] += 1 + split;
-		level_end++;
-
-		ret = bch2_btree_path_upgrade(trans, path, level_end + 1);
-		if (ret)
-			return ERR_PTR(ret);
-
-		if (!btree_path_node(path, level_end)) {
-			/* Allocating new root? */
-			nr_nodes[1] += split;
-			level_end = BTREE_MAX_DEPTH;
-			break;
-		}
-
-		/*
-		 * Always check for space for two keys, even if we won't have to
-		 * split at prior level - it might have been a merge instead:
-		 */
-		if (bch2_btree_node_insert_fits(path->l[level_end].b,
-						BKEY_BTREE_PTR_U64s_MAX * 2))
-			break;
-
-		split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
-	}
-
-	if (!down_read_trylock(&c->gc_lock)) {
-		ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0));
-		if (ret) {
-			up_read(&c->gc_lock);
-			return ERR_PTR(ret);
-		}
-	}
-
-	as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS);
-	memset(as, 0, sizeof(*as));
-	closure_init(&as->cl, NULL);
-	as->c			= c;
-	as->start_time		= start_time;
-	as->ip_started		= _RET_IP_;
-	as->mode		= BTREE_UPDATE_none;
-	as->flags		= flags;
-	as->took_gc_lock	= true;
-	as->btree_id		= path->btree_id;
-	as->update_level_start	= level_start;
-	as->update_level_end	= level_end;
-	INIT_LIST_HEAD(&as->list);
-	INIT_LIST_HEAD(&as->unwritten_list);
-	INIT_LIST_HEAD(&as->write_blocked_list);
-	bch2_keylist_init(&as->old_keys, as->_old_keys);
-	bch2_keylist_init(&as->new_keys, as->_new_keys);
-	bch2_keylist_init(&as->parent_keys, as->inline_keys);
-
-	mutex_lock(&c->btree_interior_update_lock);
-	list_add_tail(&as->list, &c->btree_interior_update_list);
-	mutex_unlock(&c->btree_interior_update_lock);
-
-	/*
-	 * We don't want to allocate if we're in an error state, that can cause
-	 * deadlock on emergency shutdown due to open buckets getting stuck in
-	 * the btree_reserve_cache after allocator shutdown has cleared it out.
-	 * This check needs to come after adding us to the btree_interior_update
-	 * list but before calling bch2_btree_reserve_get, to synchronize with
-	 * __bch2_fs_read_only().
-	 */
-	ret = bch2_journal_error(&c->journal);
-	if (ret)
-		goto err;
-
-	ret = bch2_disk_reservation_get(c, &as->disk_res,
-			(nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
-			c->opts.metadata_replicas,
-			disk_res_flags);
-	if (ret)
-		goto err;
-
-	ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL);
-	if (bch2_err_matches(ret, ENOSPC) ||
-	    bch2_err_matches(ret, ENOMEM)) {
-		struct closure cl;
-
-		/*
-		 * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
-		 * flag
-		 */
-		if (bch2_err_matches(ret, ENOSPC) &&
-		    (flags & BCH_TRANS_COMMIT_journal_reclaim) &&
-		    watermark < BCH_WATERMARK_reclaim) {
-			ret = -BCH_ERR_journal_reclaim_would_deadlock;
-			goto err;
-		}
-
-		closure_init_stack(&cl);
-
-		do {
-			ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);
-
-			bch2_trans_unlock(trans);
-			closure_sync(&cl);
-		} while (bch2_err_matches(ret, BCH_ERR_operation_blocked));
-	}
-
-	if (ret) {
-		trace_and_count(c, btree_reserve_get_fail, trans->fn,
-				_RET_IP_, nr_nodes[0] + nr_nodes[1], ret);
-		goto err;
-	}
-
-	ret = bch2_trans_relock(trans);
-	if (ret)
-		goto err;
-
-	bch2_trans_verify_not_restarted(trans, restart_count);
-	return as;
-err:
-	bch2_btree_update_free(as, trans);
-	if (!bch2_err_matches(ret, ENOSPC) &&
-	    !bch2_err_matches(ret, EROFS) &&
-	    ret != -BCH_ERR_journal_reclaim_would_deadlock)
-		bch_err_fn_ratelimited(c, ret);
-	return ERR_PTR(ret);
-}
-
-/* Btree root updates: */
-
-static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
-{
-	/* Root nodes cannot be reaped */
-	mutex_lock(&c->btree_cache.lock);
-	list_del_init(&b->list);
-	mutex_unlock(&c->btree_cache.lock);
-
-	mutex_lock(&c->btree_root_lock);
-	bch2_btree_id_root(c, b->c.btree_id)->b = b;
-	mutex_unlock(&c->btree_root_lock);
-
-	bch2_recalc_btree_reserve(c);
-}
-
-static int bch2_btree_set_root(struct btree_update *as,
-			       struct btree_trans *trans,
-			       struct btree_path *path,
-			       struct btree *b,
-			       bool nofail)
-{
-	struct bch_fs *c = as->c;
-
-	trace_and_count(c, btree_node_set_root, trans, b);
-
-	struct btree *old = btree_node_root(c, b);
-
-	/*
-	 * Ensure no one is using the old root while we switch to the
-	 * new root:
-	 */
-	if (nofail) {
-		bch2_btree_node_lock_write_nofail(trans, path, &old->c);
-	} else {
-		int ret = bch2_btree_node_lock_write(trans, path, &old->c);
-		if (ret)
-			return ret;
-	}
-
-	bch2_btree_set_root_inmem(c, b);
-
-	btree_update_updated_root(as, b);
-
-	/*
-	 * Unlock old root after new root is visible:
-	 *
-	 * The new root isn't persistent, but that's ok: we still have
-	 * an intent lock on the new root, and any updates that would
-	 * depend on the new root would have to update the new root.
-	 */
-	bch2_btree_node_unlock_write(trans, path, old);
-	return 0;
-}
-
-/* Interior node updates: */
-
-static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
-					struct btree_trans *trans,
-					struct btree_path *path,
-					struct btree *b,
-					struct btree_node_iter *node_iter,
-					struct bkey_i *insert)
-{
-	struct bch_fs *c = as->c;
-	struct bkey_packed *k;
-	struct printbuf buf = PRINTBUF;
-	unsigned long old, new, v;
-
-	BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
-	       !btree_ptr_sectors_written(insert));
-
-	if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)))
-		bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
-
-	if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
-			      btree_node_type(b), WRITE, &buf) ?:
-	    bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf)) {
-		printbuf_reset(&buf);
-		prt_printf(&buf, "inserting invalid bkey\n  ");
-		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
-		prt_printf(&buf, "\n  ");
-		bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
-				  btree_node_type(b), WRITE, &buf);
-		bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf);
-
-		bch2_fs_inconsistent(c, "%s", buf.buf);
-		dump_stack();
-	}
-
-	BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
-	       ARRAY_SIZE(as->journal_entries));
-
-	as->journal_u64s +=
-		journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
-				  BCH_JSET_ENTRY_btree_keys,
-				  b->c.btree_id, b->c.level,
-				  insert, insert->k.u64s);
-
-	while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
-	       bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
-		bch2_btree_node_iter_advance(node_iter, b);
-
-	bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
-	set_btree_node_dirty_acct(c, b);
-
-	v = READ_ONCE(b->flags);
-	do {
-		old = new = v;
-
-		new &= ~BTREE_WRITE_TYPE_MASK;
-		new |= BTREE_WRITE_interior;
-		new |= 1 << BTREE_NODE_need_write;
-	} while ((v = cmpxchg(&b->flags, old, new)) != old);
-
-	printbuf_exit(&buf);
-}
-
-static void
-bch2_btree_insert_keys_interior(struct btree_update *as,
-				struct btree_trans *trans,
-				struct btree_path *path,
-				struct btree *b,
-				struct btree_node_iter node_iter,
-				struct keylist *keys)
-{
-	struct bkey_i *insert = bch2_keylist_front(keys);
-	struct bkey_packed *k;
-
-	BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);
-
-	while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
-	       (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
-		;
-
-	while (!bch2_keylist_empty(keys)) {
-		insert = bch2_keylist_front(keys);
-
-		if (bpos_gt(insert->k.p, b->key.k.p))
-			break;
-
-		bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
-		bch2_keylist_pop_front(keys);
-	}
-}
-
-/*
- * Move keys from n1 (original replacement node, now lower node) to n2 (higher
- * node)
- */
-static void __btree_split_node(struct btree_update *as,
-			       struct btree_trans *trans,
-			       struct btree *b,
-			       struct btree *n[2])
-{
-	struct bkey_packed *k;
-	struct bpos n1_pos = POS_MIN;
-	struct btree_node_iter iter;
-	struct bset *bsets[2];
-	struct bkey_format_state format[2];
-	struct bkey_packed *out[2];
-	struct bkey uk;
-	unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5;
-	struct { unsigned nr_keys, val_u64s; } nr_keys[2];
-	int i;
-
-	memset(&nr_keys, 0, sizeof(nr_keys));
-
-	for (i = 0; i < 2; i++) {
-		BUG_ON(n[i]->nsets != 1);
-
-		bsets[i] = btree_bset_first(n[i]);
-		out[i] = bsets[i]->start;
-
-		SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1);
-		bch2_bkey_format_init(&format[i]);
-	}
-
-	u64s = 0;
-	for_each_btree_node_key(b, k, &iter) {
-		if (bkey_deleted(k))
-			continue;
-
-		uk = bkey_unpack_key(b, k);
-
-		if (b->c.level &&
-		    u64s < n1_u64s &&
-		    u64s + k->u64s >= n1_u64s &&
-		    bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p))
-			n1_u64s += k->u64s;
-
-		i = u64s >= n1_u64s;
-		u64s += k->u64s;
-		if (!i)
-			n1_pos = uk.p;
-		bch2_bkey_format_add_key(&format[i], &uk);
-
-		nr_keys[i].nr_keys++;
-		nr_keys[i].val_u64s += bkeyp_val_u64s(&b->format, k);
-	}
-
-	btree_set_min(n[0], b->data->min_key);
-	btree_set_max(n[0], n1_pos);
-	btree_set_min(n[1], bpos_successor(n1_pos));
-	btree_set_max(n[1], b->data->max_key);
-
-	for (i = 0; i < 2; i++) {
-		bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key);
-		bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key);
-
-		n[i]->data->format = bch2_bkey_format_done(&format[i]);
-
-		unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s +
-			nr_keys[i].val_u64s;
-		if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b))
-			n[i]->data->format = b->format;
-
-		btree_node_set_format(n[i], n[i]->data->format);
-	}
-
-	u64s = 0;
-	for_each_btree_node_key(b, k, &iter) {
-		if (bkey_deleted(k))
-			continue;
-
-		i = u64s >= n1_u64s;
-		u64s += k->u64s;
-
-		if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k)
-					? &b->format: &bch2_bkey_format_current, k))
-			out[i]->format = KEY_FORMAT_LOCAL_BTREE;
-		else
-			bch2_bkey_unpack(b, (void *) out[i], k);
-
-		out[i]->needs_whiteout = false;
-
-		btree_keys_account_key_add(&n[i]->nr, 0, out[i]);
-		out[i] = bkey_p_next(out[i]);
-	}
-
-	for (i = 0; i < 2; i++) {
-		bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data);
-
-		BUG_ON(!bsets[i]->u64s);
-
-		set_btree_bset_end(n[i], n[i]->set);
-
-		btree_node_reset_sib_u64s(n[i]);
-
-		bch2_verify_btree_nr_keys(n[i]);
-
-		BUG_ON(bch2_btree_node_check_topology(trans, n[i]));
-	}
-}
-
-/*
- * For updates to interior nodes, we've got to do the insert before we split
- * because the stuff we're inserting has to be inserted atomically. Post split,
- * the keys might have to go in different nodes and the split would no longer be
- * atomic.
- *
- * Worse, if the insert is from btree node coalescing, if we do the insert after
- * we do the split (and pick the pivot) - the pivot we pick might be between
- * nodes that were coalesced, and thus in the middle of a child node post
- * coalescing:
- */
-static void btree_split_insert_keys(struct btree_update *as,
-				    struct btree_trans *trans,
-				    btree_path_idx_t path_idx,
-				    struct btree *b,
-				    struct keylist *keys)
-{
-	struct btree_path *path = trans->paths + path_idx;
-
-	if (!bch2_keylist_empty(keys) &&
-	    bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) {
-		struct btree_node_iter node_iter;
-
-		bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
-
-		bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
-
-		BUG_ON(bch2_btree_node_check_topology(trans, b));
-	}
-}
-
-static int btree_split(struct btree_update *as, struct btree_trans *trans,
-		       btree_path_idx_t path, struct btree *b,
-		       struct keylist *keys)
-{
-	struct bch_fs *c = as->c;
-	struct btree *parent = btree_node_parent(trans->paths + path, b);
-	struct btree *n1, *n2 = NULL, *n3 = NULL;
-	btree_path_idx_t path1 = 0, path2 = 0;
-	u64 start_time = local_clock();
-	int ret = 0;
-
-	bch2_verify_btree_nr_keys(b);
-	BUG_ON(!parent && (b != btree_node_root(c, b)));
-	BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1));
-
-	ret = bch2_btree_node_check_topology(trans, b);
-	if (ret)
-		return ret;
-
-	bch2_btree_interior_update_will_free_node(as, b);
-
-	if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
-		struct btree *n[2];
-
-		trace_and_count(c, btree_node_split, trans, b);
-
-		n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
-		n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);
-
-		__btree_split_node(as, trans, b, n);
-
-		if (keys) {
-			btree_split_insert_keys(as, trans, path, n1, keys);
-			btree_split_insert_keys(as, trans, path, n2, keys);
-			BUG_ON(!bch2_keylist_empty(keys));
-		}
-
-		bch2_btree_build_aux_trees(n2);
-		bch2_btree_build_aux_trees(n1);
-
-		bch2_btree_update_add_new_node(as, n1);
-		bch2_btree_update_add_new_node(as, n2);
-		six_unlock_write(&n2->c.lock);
-		six_unlock_write(&n1->c.lock);
-
-		path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p);
-		six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
-		mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
-		bch2_btree_path_level_init(trans, trans->paths + path1, n1);
-
-		path2 = bch2_path_get_unlocked_mut(trans, as->btree_id, n2->c.level, n2->key.k.p);
-		six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
-		mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
-		bch2_btree_path_level_init(trans, trans->paths + path2, n2);
-
-		/*
-		 * Note that on recursive parent_keys == keys, so we
-		 * can't start adding new keys to parent_keys before emptying it
-		 * out (which we did with btree_split_insert_keys() above)
-		 */
-		bch2_keylist_add(&as->parent_keys, &n1->key);
-		bch2_keylist_add(&as->parent_keys, &n2->key);
-
-		if (!parent) {
-			/* Depth increases, make a new root */
-			n3 = __btree_root_alloc(as, trans, b->c.level + 1);
-
-			bch2_btree_update_add_new_node(as, n3);
-			six_unlock_write(&n3->c.lock);
-
-			trans->paths[path2].locks_want++;
-			BUG_ON(btree_node_locked(trans->paths + path2, n3->c.level));
-			six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
-			mark_btree_node_locked(trans, trans->paths + path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
-			bch2_btree_path_level_init(trans, trans->paths + path2, n3);
-
-			n3->sib_u64s[0] = U16_MAX;
-			n3->sib_u64s[1] = U16_MAX;
-
-			btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
-		}
-	} else {
-		trace_and_count(c, btree_node_compact, trans, b);
-
-		n1 = bch2_btree_node_alloc_replacement(as, trans, b);
-
-		if (keys) {
-			btree_split_insert_keys(as, trans, path, n1, keys);
-			BUG_ON(!bch2_keylist_empty(keys));
-		}
-
-		bch2_btree_build_aux_trees(n1);
-		bch2_btree_update_add_new_node(as, n1);
-		six_unlock_write(&n1->c.lock);
-
-		path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p);
-		six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
-		mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
-		bch2_btree_path_level_init(trans, trans->paths + path1, n1);
-
-		if (parent)
-			bch2_keylist_add(&as->parent_keys, &n1->key);
-	}
-
-	/* New nodes all written, now make them visible: */
-
-	if (parent) {
-		/* Split a non root node */
-		ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
-	} else if (n3) {
-		ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false);
-	} else {
-		/* Root filled up but didn't need to be split */
-		ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false);
-	}
-
-	if (ret)
-		goto err;
-
-	if (n3) {
-		bch2_btree_update_get_open_buckets(as, n3);
-		bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
-	}
-	if (n2) {
-		bch2_btree_update_get_open_buckets(as, n2);
-		bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0);
-	}
-	bch2_btree_update_get_open_buckets(as, n1);
-	bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);
-
-	/*
-	 * The old node must be freed (in memory) _before_ unlocking the new
-	 * nodes - else another thread could re-acquire a read lock on the old
-	 * node after another thread has locked and updated the new node, thus
-	 * seeing stale data:
-	 */
-	bch2_btree_node_free_inmem(trans, trans->paths + path, b);
-
-	if (n3)
-		bch2_trans_node_add(trans, trans->paths + path, n3);
-	if (n2)
-		bch2_trans_node_add(trans, trans->paths + path2, n2);
-	bch2_trans_node_add(trans, trans->paths + path1, n1);
-
-	if (n3)
-		six_unlock_intent(&n3->c.lock);
-	if (n2)
-		six_unlock_intent(&n2->c.lock);
-	six_unlock_intent(&n1->c.lock);
-out:
-	if (path2) {
-		__bch2_btree_path_unlock(trans, trans->paths + path2);
-		bch2_path_put(trans, path2, true);
-	}
-	if (path1) {
-		__bch2_btree_path_unlock(trans, trans->paths + path1);
-		bch2_path_put(trans, path1, true);
-	}
-
-	bch2_trans_verify_locks(trans);
-
-	bch2_time_stats_update(&c->times[n2
-			       ? BCH_TIME_btree_node_split
-			       : BCH_TIME_btree_node_compact],
-			       start_time);
-	return ret;
-err:
-	if (n3)
-		bch2_btree_node_free_never_used(as, trans, n3);
-	if (n2)
-		bch2_btree_node_free_never_used(as, trans, n2);
-	bch2_btree_node_free_never_used(as, trans, n1);
-	goto out;
-}
-
-/**
- * bch2_btree_insert_node - insert bkeys into a given btree node
- *
- * @as:			btree_update object
- * @trans:		btree_trans object
- * @path_idx:		path that points to current node
- * @b:			node to insert keys into
- * @keys:		list of keys to insert
- *
- * Returns: 0 on success, typically transaction restart error on failure
- *
- * Inserts as many keys as it can into a given btree node, splitting it if full.
- * If a split occurred, this function will return early. This can only happen
- * for leaf nodes -- inserts into interior nodes have to be atomic.
- */
-static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
-				  btree_path_idx_t path_idx, struct btree *b,
-				  struct keylist *keys)
-{
-	struct bch_fs *c = as->c;
-	struct btree_path *path = trans->paths + path_idx, *linked;
-	unsigned i;
-	int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
-	int old_live_u64s = b->nr.live_u64s;
-	int live_u64s_added, u64s_added;
-	int ret;
-
-	lockdep_assert_held(&c->gc_lock);
-	BUG_ON(!btree_node_intent_locked(path, b->c.level));
-	BUG_ON(!b->c.level);
-	BUG_ON(!as || as->b);
-	bch2_verify_keylist_sorted(keys);
-
-	ret = bch2_btree_node_lock_write(trans, path, &b->c);
-	if (ret)
-		return ret;
-
-	bch2_btree_node_prep_for_write(trans, path, b);
-
-	if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) {
-		bch2_btree_node_unlock_write(trans, path, b);
-		goto split;
-	}
-
-	ret = bch2_btree_node_check_topology(trans, b);
-	if (ret) {
-		bch2_btree_node_unlock_write(trans, path, b);
-		return ret;
-	}
-
-	bch2_btree_insert_keys_interior(as, trans, path, b,
-					path->l[b->c.level].iter, keys);
-
-	trans_for_each_path_with_node(trans, b, linked, i)
-		bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
-
-	bch2_trans_verify_paths(trans);
-
-	live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
-	u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
-
-	if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
-		b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
-	if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
-		b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
-
-	if (u64s_added > live_u64s_added &&
-	    bch2_maybe_compact_whiteouts(c, b))
-		bch2_trans_node_reinit_iter(trans, b);
-
-	btree_update_updated_node(as, b);
-	bch2_btree_node_unlock_write(trans, path, b);
-
-	BUG_ON(bch2_btree_node_check_topology(trans, b));
-	return 0;
-split:
-	/*
-	 * We could attempt to avoid the transaction restart, by calling
-	 * bch2_btree_path_upgrade() and allocating more nodes:
-	 */
-	if (b->c.level >= as->update_level_end) {
-		trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
-		return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
-	}
-
-	return btree_split(as, trans, path_idx, b, keys);
-}
-
-int bch2_btree_split_leaf(struct btree_trans *trans,
-			  btree_path_idx_t path,
-			  unsigned flags)
-{
-	/* btree_split & merge may both cause paths array to be reallocated */
-	struct btree *b = path_l(trans->paths + path)->b;
-	struct btree_update *as;
-	unsigned l;
-	int ret = 0;
-
-	as = bch2_btree_update_start(trans, trans->paths + path,
-				     trans->paths[path].level,
-				     true, flags);
-	if (IS_ERR(as))
-		return PTR_ERR(as);
-
-	ret = btree_split(as, trans, path, b, NULL);
-	if (ret) {
-		bch2_btree_update_free(as, trans);
-		return ret;
-	}
-
-	bch2_btree_update_done(as, trans);
-
-	for (l = trans->paths[path].level + 1;
-	     btree_node_intent_locked(&trans->paths[path], l) && !ret;
-	     l++)
-		ret = bch2_foreground_maybe_merge(trans, path, l, flags);
-
-	return ret;
-}
-
-static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans,
-				   btree_path_idx_t path_idx)
-{
-	struct bch_fs *c = as->c;
-	struct btree_path *path = trans->paths + path_idx;
-	struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b;
-
-	BUG_ON(!btree_node_locked(path, b->c.level));
-
-	n = __btree_root_alloc(as, trans, b->c.level + 1);
-
-	bch2_btree_update_add_new_node(as, n);
-	six_unlock_write(&n->c.lock);
-
-	path->locks_want++;
-	BUG_ON(btree_node_locked(path, n->c.level));
-	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
-	mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED);
-	bch2_btree_path_level_init(trans, path, n);
-
-	n->sib_u64s[0] = U16_MAX;
-	n->sib_u64s[1] = U16_MAX;
-
-	bch2_keylist_add(&as->parent_keys, &b->key);
-	btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys);
-
-	int ret = bch2_btree_set_root(as, trans, path, n, true);
-	BUG_ON(ret);
-
-	bch2_btree_update_get_open_buckets(as, n);
-	bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
-	bch2_trans_node_add(trans, path, n);
-	six_unlock_intent(&n->c.lock);
-
-	mutex_lock(&c->btree_cache.lock);
-	list_add_tail(&b->list, &c->btree_cache.live);
-	mutex_unlock(&c->btree_cache.lock);
-
-	bch2_trans_verify_locks(trans);
-}
-
-int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags)
-{
-	struct bch_fs *c = trans->c;
-	struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b;
-
-	if (btree_node_fake(b))
-		return bch2_btree_split_leaf(trans, path, flags);
-
-	struct btree_update *as =
-		bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags);
-	if (IS_ERR(as))
-		return PTR_ERR(as);
-
-	__btree_increase_depth(as, trans, path);
-	bch2_btree_update_done(as, trans);
-	return 0;
-}
-
-int __bch2_foreground_maybe_merge(struct btree_trans *trans,
-				  btree_path_idx_t path,
-				  unsigned level,
-				  unsigned flags,
-				  enum btree_node_sibling sib)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_update *as;
-	struct bkey_format_state new_s;
-	struct bkey_format new_f;
-	struct bkey_i delete;
-	struct btree *b, *m, *n, *prev, *next, *parent;
-	struct bpos sib_pos;
-	size_t sib_u64s;
-	enum btree_id btree = trans->paths[path].btree_id;
-	btree_path_idx_t sib_path = 0, new_path = 0;
-	u64 start_time = local_clock();
-	int ret = 0;
-
-	bch2_trans_verify_not_in_restart(trans);
-	bch2_trans_verify_not_unlocked(trans);
-	BUG_ON(!trans->paths[path].should_be_locked);
-	BUG_ON(!btree_node_locked(&trans->paths[path], level));
-
-	/*
-	 * Work around a deadlock caused by the btree write buffer not doing
-	 * merges and leaving tons of merges for us to do - we really don't need
-	 * to be doing merges at all from the interior update path, and if the
-	 * interior update path is generating too many new interior updates we
-	 * deadlock:
-	 */
-	if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates)
-		return 0;
-
-	if ((flags & BCH_WATERMARK_MASK) <= BCH_WATERMARK_reclaim) {
-		flags &= ~BCH_WATERMARK_MASK;
-		flags |= BCH_WATERMARK_btree;
-		flags |= BCH_TRANS_COMMIT_journal_reclaim;
-	}
-
-	b = trans->paths[path].l[level].b;
-
-	if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
-	    (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) {
-		b->sib_u64s[sib] = U16_MAX;
-		return 0;
-	}
-
-	sib_pos = sib == btree_prev_sib
-		? bpos_predecessor(b->data->min_key)
-		: bpos_successor(b->data->max_key);
-
-	sib_path = bch2_path_get(trans, btree, sib_pos,
-				 U8_MAX, level, BTREE_ITER_intent, _THIS_IP_);
-	ret = bch2_btree_path_traverse(trans, sib_path, false);
-	if (ret)
-		goto err;
-
-	btree_path_set_should_be_locked(trans->paths + sib_path);
-
-	m = trans->paths[sib_path].l[level].b;
-
-	if (btree_node_parent(trans->paths + path, b) !=
-	    btree_node_parent(trans->paths + sib_path, m)) {
-		b->sib_u64s[sib] = U16_MAX;
-		goto out;
-	}
-
-	if (sib == btree_prev_sib) {
-		prev = m;
-		next = b;
-	} else {
-		prev = b;
-		next = m;
-	}
-
-	if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) {
-		struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
-
-		bch2_bpos_to_text(&buf1, prev->data->max_key);
-		bch2_bpos_to_text(&buf2, next->data->min_key);
-		bch_err(c,
-			"%s(): btree topology error:\n"
-			"  prev ends at   %s\n"
-			"  next starts at %s",
-			__func__, buf1.buf, buf2.buf);
-		printbuf_exit(&buf1);
-		printbuf_exit(&buf2);
-		ret = bch2_topology_error(c);
-		goto err;
-	}
-
-	bch2_bkey_format_init(&new_s);
-	bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
-	__bch2_btree_calc_format(&new_s, prev);
-	__bch2_btree_calc_format(&new_s, next);
-	bch2_bkey_format_add_pos(&new_s, next->data->max_key);
-	new_f = bch2_bkey_format_done(&new_s);
-
-	sib_u64s = btree_node_u64s_with_format(b->nr, &b->format, &new_f) +
-		btree_node_u64s_with_format(m->nr, &m->format, &new_f);
-
-	if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
-		sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
-		sib_u64s /= 2;
-		sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
-	}
-
-	sib_u64s = min(sib_u64s, btree_max_u64s(c));
-	sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
-	b->sib_u64s[sib] = sib_u64s;
-
-	if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
-		goto out;
-
-	parent = btree_node_parent(trans->paths + path, b);
-	as = bch2_btree_update_start(trans, trans->paths + path, level, false,
-				     BCH_TRANS_COMMIT_no_enospc|flags);
-	ret = PTR_ERR_OR_ZERO(as);
-	if (ret)
-		goto err;
-
-	trace_and_count(c, btree_node_merge, trans, b);
-
-	bch2_btree_interior_update_will_free_node(as, b);
-	bch2_btree_interior_update_will_free_node(as, m);
-
-	n = bch2_btree_node_alloc(as, trans, b->c.level);
-
-	SET_BTREE_NODE_SEQ(n->data,
-			   max(BTREE_NODE_SEQ(b->data),
-			       BTREE_NODE_SEQ(m->data)) + 1);
-
-	btree_set_min(n, prev->data->min_key);
-	btree_set_max(n, next->data->max_key);
-
-	n->data->format	 = new_f;
-	btree_node_set_format(n, new_f);
-
-	bch2_btree_sort_into(c, n, prev);
-	bch2_btree_sort_into(c, n, next);
-
-	bch2_btree_build_aux_trees(n);
-	bch2_btree_update_add_new_node(as, n);
-	six_unlock_write(&n->c.lock);
-
-	new_path = bch2_path_get_unlocked_mut(trans, btree, n->c.level, n->key.k.p);
-	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
-	mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
-	bch2_btree_path_level_init(trans, trans->paths + new_path, n);
-
-	bkey_init(&delete.k);
-	delete.k.p = prev->key.k.p;
-	bch2_keylist_add(&as->parent_keys, &delete);
-	bch2_keylist_add(&as->parent_keys, &n->key);
-
-	bch2_trans_verify_paths(trans);
-
-	ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
-	if (ret)
-		goto err_free_update;
-
-	bch2_trans_verify_paths(trans);
-
-	bch2_btree_update_get_open_buckets(as, n);
-	bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
-
-	bch2_btree_node_free_inmem(trans, trans->paths + path, b);
-	bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m);
-
-	bch2_trans_node_add(trans, trans->paths + path, n);
-
-	bch2_trans_verify_paths(trans);
-
-	six_unlock_intent(&n->c.lock);
-
-	bch2_btree_update_done(as, trans);
-
-	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
-out:
-err:
-	if (new_path)
-		bch2_path_put(trans, new_path, true);
-	bch2_path_put(trans, sib_path, true);
-	bch2_trans_verify_locks(trans);
-	if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
-		ret = 0;
-	if (!ret)
-		ret = bch2_trans_relock(trans);
-	return ret;
-err_free_update:
-	bch2_btree_node_free_never_used(as, trans, n);
-	bch2_btree_update_free(as, trans);
-	goto out;
-}
-
-int bch2_btree_node_rewrite(struct btree_trans *trans,
-			    struct btree_iter *iter,
-			    struct btree *b,
-			    unsigned flags)
-{
-	struct bch_fs *c = trans->c;
-	struct btree *n, *parent;
-	struct btree_update *as;
-	btree_path_idx_t new_path = 0;
-	int ret;
-
-	flags |= BCH_TRANS_COMMIT_no_enospc;
-
-	struct btree_path *path = btree_iter_path(trans, iter);
-	parent = btree_node_parent(path, b);
-	as = bch2_btree_update_start(trans, path, b->c.level, false, flags);
-	ret = PTR_ERR_OR_ZERO(as);
-	if (ret)
-		goto out;
-
-	bch2_btree_interior_update_will_free_node(as, b);
-
-	n = bch2_btree_node_alloc_replacement(as, trans, b);
-
-	bch2_btree_build_aux_trees(n);
-	bch2_btree_update_add_new_node(as, n);
-	six_unlock_write(&n->c.lock);
-
-	new_path = bch2_path_get_unlocked_mut(trans, iter->btree_id, n->c.level, n->key.k.p);
-	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
-	mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
-	bch2_btree_path_level_init(trans, trans->paths + new_path, n);
-
-	trace_and_count(c, btree_node_rewrite, trans, b);
-
-	if (parent) {
-		bch2_keylist_add(&as->parent_keys, &n->key);
-		ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys);
-	} else {
-		ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false);
-	}
-
-	if (ret)
-		goto err;
-
-	bch2_btree_update_get_open_buckets(as, n);
-	bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
-
-	bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b);
-
-	bch2_trans_node_add(trans, trans->paths + iter->path, n);
-	six_unlock_intent(&n->c.lock);
-
-	bch2_btree_update_done(as, trans);
-out:
-	if (new_path)
-		bch2_path_put(trans, new_path, true);
-	bch2_trans_downgrade(trans);
-	return ret;
-err:
-	bch2_btree_node_free_never_used(as, trans, n);
-	bch2_btree_update_free(as, trans);
-	goto out;
-}
-
-struct async_btree_rewrite {
-	struct bch_fs		*c;
-	struct work_struct	work;
-	struct list_head	list;
-	enum btree_id		btree_id;
-	unsigned		level;
-	struct bpos		pos;
-	__le64			seq;
-};
-
-static int async_btree_node_rewrite_trans(struct btree_trans *trans,
-					  struct async_btree_rewrite *a)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter iter;
-	struct btree *b;
-	int ret;
-
-	bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos,
-				  BTREE_MAX_DEPTH, a->level, 0);
-	b = bch2_btree_iter_peek_node(&iter);
-	ret = PTR_ERR_OR_ZERO(b);
-	if (ret)
-		goto out;
-
-	if (!b || b->data->keys.seq != a->seq) {
-		struct printbuf buf = PRINTBUF;
-
-		if (b)
-			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-		else
-			prt_str(&buf, "(null");
-		bch_info(c, "%s: node to rewrite not found:, searching for seq %llu, got\n%s",
-			 __func__, a->seq, buf.buf);
-		printbuf_exit(&buf);
-		goto out;
-	}
-
-	ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
-out:
-	bch2_trans_iter_exit(trans, &iter);
-
-	return ret;
-}
-
-static void async_btree_node_rewrite_work(struct work_struct *work)
-{
-	struct async_btree_rewrite *a =
-		container_of(work, struct async_btree_rewrite, work);
-	struct bch_fs *c = a->c;
-	int ret;
-
-	ret = bch2_trans_do(c, NULL, NULL, 0,
-		      async_btree_node_rewrite_trans(trans, a));
-	bch_err_fn_ratelimited(c, ret);
-	bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
-	kfree(a);
-}
-
-void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
-{
-	struct async_btree_rewrite *a;
-	int ret;
-
-	a = kmalloc(sizeof(*a), GFP_NOFS);
-	if (!a) {
-		bch_err(c, "%s: error allocating memory", __func__);
-		return;
-	}
-
-	a->c		= c;
-	a->btree_id	= b->c.btree_id;
-	a->level	= b->c.level;
-	a->pos		= b->key.k.p;
-	a->seq		= b->data->keys.seq;
-	INIT_WORK(&a->work, async_btree_node_rewrite_work);
-
-	if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) {
-		mutex_lock(&c->pending_node_rewrites_lock);
-		list_add(&a->list, &c->pending_node_rewrites);
-		mutex_unlock(&c->pending_node_rewrites_lock);
-		return;
-	}
-
-	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
-		if (test_bit(BCH_FS_started, &c->flags)) {
-			bch_err(c, "%s: error getting c->writes ref", __func__);
-			kfree(a);
-			return;
-		}
-
-		ret = bch2_fs_read_write_early(c);
-		bch_err_msg(c, ret, "going read-write");
-		if (ret) {
-			kfree(a);
-			return;
-		}
-
-		bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
-	}
-
-	queue_work(c->btree_node_rewrite_worker, &a->work);
-}
-
-void bch2_do_pending_node_rewrites(struct bch_fs *c)
-{
-	struct async_btree_rewrite *a, *n;
-
-	mutex_lock(&c->pending_node_rewrites_lock);
-	list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) {
-		list_del(&a->list);
-
-		bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
-		queue_work(c->btree_node_rewrite_worker, &a->work);
-	}
-	mutex_unlock(&c->pending_node_rewrites_lock);
-}
-
-void bch2_free_pending_node_rewrites(struct bch_fs *c)
-{
-	struct async_btree_rewrite *a, *n;
-
-	mutex_lock(&c->pending_node_rewrites_lock);
-	list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) {
-		list_del(&a->list);
-
-		kfree(a);
-	}
-	mutex_unlock(&c->pending_node_rewrites_lock);
-}
-
-static int __bch2_btree_node_update_key(struct btree_trans *trans,
-					struct btree_iter *iter,
-					struct btree *b, struct btree *new_hash,
-					struct bkey_i *new_key,
-					unsigned commit_flags,
-					bool skip_triggers)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter iter2 = { NULL };
-	struct btree *parent;
-	int ret;
-
-	if (!skip_triggers) {
-		ret   = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1,
-					     bkey_i_to_s_c(&b->key),
-					     BTREE_TRIGGER_transactional) ?:
-			bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1,
-					     bkey_i_to_s(new_key),
-					     BTREE_TRIGGER_transactional);
-		if (ret)
-			return ret;
-	}
-
-	if (new_hash) {
-		bkey_copy(&new_hash->key, new_key);
-		ret = bch2_btree_node_hash_insert(&c->btree_cache,
-				new_hash, b->c.level, b->c.btree_id);
-		BUG_ON(ret);
-	}
-
-	parent = btree_node_parent(btree_iter_path(trans, iter), b);
-	if (parent) {
-		bch2_trans_copy_iter(&iter2, iter);
-
-		iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
-				iter2.flags & BTREE_ITER_intent,
-				_THIS_IP_);
-
-		struct btree_path *path2 = btree_iter_path(trans, &iter2);
-		BUG_ON(path2->level != b->c.level);
-		BUG_ON(!bpos_eq(path2->pos, new_key->k.p));
-
-		btree_path_set_level_up(trans, path2);
-
-		trans->paths_sorted = false;
-
-		ret   = bch2_btree_iter_traverse(&iter2) ?:
-			bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun);
-		if (ret)
-			goto err;
-	} else {
-		BUG_ON(btree_node_root(c, b) != b);
-
-		struct jset_entry *e = bch2_trans_jset_entry_alloc(trans,
-				       jset_u64s(new_key->k.u64s));
-		ret = PTR_ERR_OR_ZERO(e);
-		if (ret)
-			return ret;
-
-		journal_entry_set(e,
-				  BCH_JSET_ENTRY_btree_root,
-				  b->c.btree_id, b->c.level,
-				  new_key, new_key->k.u64s);
-	}
-
-	ret = bch2_trans_commit(trans, NULL, NULL, commit_flags);
-	if (ret)
-		goto err;
-
-	bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c);
-
-	if (new_hash) {
-		mutex_lock(&c->btree_cache.lock);
-		bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
-		bch2_btree_node_hash_remove(&c->btree_cache, b);
-
-		bkey_copy(&b->key, new_key);
-		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
-		BUG_ON(ret);
-		mutex_unlock(&c->btree_cache.lock);
-	} else {
-		bkey_copy(&b->key, new_key);
-	}
-
-	bch2_btree_node_unlock_write(trans, btree_iter_path(trans, iter), b);
-out:
-	bch2_trans_iter_exit(trans, &iter2);
-	return ret;
-err:
-	if (new_hash) {
-		mutex_lock(&c->btree_cache.lock);
-		bch2_btree_node_hash_remove(&c->btree_cache, b);
-		mutex_unlock(&c->btree_cache.lock);
-	}
-	goto out;
-}
-
-int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter,
-			       struct btree *b, struct bkey_i *new_key,
-			       unsigned commit_flags, bool skip_triggers)
-{
-	struct bch_fs *c = trans->c;
-	struct btree *new_hash = NULL;
-	struct btree_path *path = btree_iter_path(trans, iter);
-	struct closure cl;
-	int ret = 0;
-
-	ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1);
-	if (ret)
-		return ret;
-
-	closure_init_stack(&cl);
-
-	/*
-	 * check btree_ptr_hash_val() after @b is locked by
-	 * btree_iter_traverse():
-	 */
-	if (btree_ptr_hash_val(new_key) != b->hash_val) {
-		ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
-		if (ret) {
-			ret = drop_locks_do(trans, (closure_sync(&cl), 0));
-			if (ret)
-				return ret;
-		}
-
-		new_hash = bch2_btree_node_mem_alloc(trans, false);
-	}
-
-	path->intent_ref++;
-	ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key,
-					   commit_flags, skip_triggers);
-	--path->intent_ref;
-
-	if (new_hash) {
-		mutex_lock(&c->btree_cache.lock);
-		list_move(&new_hash->list, &c->btree_cache.freeable);
-		mutex_unlock(&c->btree_cache.lock);
-
-		six_unlock_write(&new_hash->c.lock);
-		six_unlock_intent(&new_hash->c.lock);
-	}
-	closure_sync(&cl);
-	bch2_btree_cache_cannibalize_unlock(trans);
-	return ret;
-}
-
-int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
-					struct btree *b, struct bkey_i *new_key,
-					unsigned commit_flags, bool skip_triggers)
-{
-	struct btree_iter iter;
-	int ret;
-
-	bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
-				  BTREE_MAX_DEPTH, b->c.level,
-				  BTREE_ITER_intent);
-	ret = bch2_btree_iter_traverse(&iter);
-	if (ret)
-		goto out;
-
-	/* has node been freed? */
-	if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) {
-		/* node has been freed: */
-		BUG_ON(!btree_node_dying(b));
-		goto out;
-	}
-
-	BUG_ON(!btree_node_hashed(b));
-
-	bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
-			    !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));
-
-	ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
-					 commit_flags, skip_triggers);
-out:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-/* Init code: */
-
-/*
- * Only for filesystem bringup, when first reading the btree roots or allocating
- * btree roots when initializing a new filesystem:
- */
-void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
-{
-	BUG_ON(btree_node_root(c, b));
-
-	bch2_btree_set_root_inmem(c, b);
-}
-
-int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id, unsigned level)
-{
-	struct bch_fs *c = trans->c;
-	struct closure cl;
-	struct btree *b;
-	int ret;
-
-	closure_init_stack(&cl);
-
-	do {
-		ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
-		closure_sync(&cl);
-	} while (ret);
-
-	b = bch2_btree_node_mem_alloc(trans, false);
-	bch2_btree_cache_cannibalize_unlock(trans);
-
-	set_btree_node_fake(b);
-	set_btree_node_need_rewrite(b);
-	b->c.level	= level;
-	b->c.btree_id	= id;
-
-	bkey_btree_ptr_init(&b->key);
-	b->key.k.p = SPOS_MAX;
-	*((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;
-
-	bch2_bset_init_first(b, &b->data->keys);
-	bch2_btree_build_aux_trees(b);
-
-	b->data->flags = 0;
-	btree_set_min(b, POS_MIN);
-	btree_set_max(b, SPOS_MAX);
-	b->data->format = bch2_btree_calc_format(b);
-	btree_node_set_format(b, b->data->format);
-
-	ret = bch2_btree_node_hash_insert(&c->btree_cache, b,
-					  b->c.level, b->c.btree_id);
-	BUG_ON(ret);
-
-	bch2_btree_set_root_inmem(c, b);
-
-	six_unlock_write(&b->c.lock);
-	six_unlock_intent(&b->c.lock);
-	return 0;
-}
-
-void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
-{
-	bch2_trans_run(c, bch2_btree_root_alloc_fake_trans(trans, id, level));
-}
-
-static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
-{
-	prt_printf(out, "%ps: ", (void *) as->ip_started);
-	bch2_trans_commit_flags_to_text(out, as->flags);
-
-	prt_printf(out, " btree=%s l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
-		   bch2_btree_id_str(as->btree_id),
-		   as->update_level_start,
-		   as->update_level_end,
-		   bch2_btree_update_modes[as->mode],
-		   as->nodes_written,
-		   closure_nr_remaining(&as->cl),
-		   as->journal.seq);
-}
-
-void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
-{
-	struct btree_update *as;
-
-	mutex_lock(&c->btree_interior_update_lock);
-	list_for_each_entry(as, &c->btree_interior_update_list, list)
-		bch2_btree_update_to_text(out, as);
-	mutex_unlock(&c->btree_interior_update_lock);
-}
-
-static bool bch2_btree_interior_updates_pending(struct bch_fs *c)
-{
-	bool ret;
-
-	mutex_lock(&c->btree_interior_update_lock);
-	ret = !list_empty(&c->btree_interior_update_list);
-	mutex_unlock(&c->btree_interior_update_lock);
-
-	return ret;
-}
-
-bool bch2_btree_interior_updates_flush(struct bch_fs *c)
-{
-	bool ret = bch2_btree_interior_updates_pending(c);
-
-	if (ret)
-		closure_wait_event(&c->btree_interior_update_wait,
-				   !bch2_btree_interior_updates_pending(c));
-	return ret;
-}
-
-void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry)
-{
-	struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);
-
-	mutex_lock(&c->btree_root_lock);
-
-	r->level = entry->level;
-	r->alive = true;
-	bkey_copy(&r->key, (struct bkey_i *) entry->start);
-
-	mutex_unlock(&c->btree_root_lock);
-}
-
-struct jset_entry *
-bch2_btree_roots_to_journal_entries(struct bch_fs *c,
-				    struct jset_entry *end,
-				    unsigned long skip)
-{
-	unsigned i;
-
-	mutex_lock(&c->btree_root_lock);
-
-	for (i = 0; i < btree_id_nr_alive(c); i++) {
-		struct btree_root *r = bch2_btree_id_root(c, i);
-
-		if (r->alive && !test_bit(i, &skip)) {
-			journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
-					  i, r->level, &r->key, r->key.k.u64s);
-			end = vstruct_next(end);
-		}
-	}
-
-	mutex_unlock(&c->btree_root_lock);
-
-	return end;
-}
-
-void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
-{
-	if (c->btree_node_rewrite_worker)
-		destroy_workqueue(c->btree_node_rewrite_worker);
-	if (c->btree_interior_update_worker)
-		destroy_workqueue(c->btree_interior_update_worker);
-	mempool_exit(&c->btree_interior_update_pool);
-}
-
-void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
-{
-	mutex_init(&c->btree_reserve_cache_lock);
-	INIT_LIST_HEAD(&c->btree_interior_update_list);
-	INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
-	mutex_init(&c->btree_interior_update_lock);
-	INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);
-
-	INIT_LIST_HEAD(&c->pending_node_rewrites);
-	mutex_init(&c->pending_node_rewrites_lock);
-}
-
-int bch2_fs_btree_interior_update_init(struct bch_fs *c)
-{
-	c->btree_interior_update_worker =
-		alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
-	if (!c->btree_interior_update_worker)
-		return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
-
-	c->btree_node_rewrite_worker =
-		alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);
-	if (!c->btree_node_rewrite_worker)
-		return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
-
-	if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
-				      sizeof(struct btree_update)))
-		return -BCH_ERR_ENOMEM_btree_interior_update_pool_init;
-
-	return 0;
-}
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
deleted file mode 100644
index b5b76ce01cfc..000000000000
--- a/fs/bcachefs/btree_update_interior.h
+++ /dev/null
@@ -1,342 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H
-#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H
-
-#include "btree_cache.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-
-#define BTREE_UPDATE_NODES_MAX		((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES)
-
-#define BTREE_UPDATE_JOURNAL_RES	(BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1))
-
-int bch2_btree_node_check_topology(struct btree_trans *, struct btree *);
-
-#define BTREE_UPDATE_MODES()	\
-	x(none)			\
-	x(node)			\
-	x(root)			\
-	x(update)
-
-enum btree_update_mode {
-#define x(n)	BTREE_UPDATE_##n,
-	BTREE_UPDATE_MODES()
-#undef x
-};
-
-/*
- * Tracks an in progress split/rewrite of a btree node and the update to the
- * parent node:
- *
- * When we split/rewrite a node, we do all the updates in memory without
- * waiting for any writes to complete - we allocate the new node(s) and update
- * the parent node, possibly recursively up to the root.
- *
- * The end result is that we have one or more new nodes being written -
- * possibly several, if there were multiple splits - and then a write (updating
- * an interior node) which will make all these new nodes visible.
- *
- * Additionally, as we split/rewrite nodes we free the old nodes - but the old
- * nodes can't be freed (their space on disk can't be reclaimed) until the
- * update to the interior node that makes the new node visible completes -
- * until then, the old nodes are still reachable on disk.
- *
- */
-struct btree_update {
-	struct closure			cl;
-	struct bch_fs			*c;
-	u64				start_time;
-	unsigned long			ip_started;
-
-	struct list_head		list;
-	struct list_head		unwritten_list;
-
-	enum btree_update_mode		mode;
-	enum bch_trans_commit_flags	flags;
-	unsigned			nodes_written:1;
-	unsigned			took_gc_lock:1;
-
-	enum btree_id			btree_id;
-	unsigned			update_level_start;
-	unsigned			update_level_end;
-
-	struct disk_reservation		disk_res;
-
-	/*
-	 * BTREE_UPDATE_node:
-	 * The update that made the new nodes visible was a regular update to an
-	 * existing interior node - @b. We can't write out the update to @b
-	 * until the new nodes we created are finished writing, so we block @b
-	 * from writing by putting this btree_interior update on the
-	 * @b->write_blocked list with @write_blocked_list:
-	 */
-	struct btree			*b;
-	struct list_head		write_blocked_list;
-
-	/*
-	 * We may be freeing nodes that were dirty, and thus had journal entries
-	 * pinned: we need to transfer the oldest of those pins to the
-	 * btree_update operation, and release it when the new node(s)
-	 * are all persistent and reachable:
-	 */
-	struct journal_entry_pin	journal;
-
-	/* Preallocated nodes we reserve when we start the update: */
-	struct prealloc_nodes {
-		struct btree		*b[BTREE_UPDATE_NODES_MAX];
-		unsigned		nr;
-	}				prealloc_nodes[2];
-
-	/* Nodes being freed: */
-	struct keylist			old_keys;
-	u64				_old_keys[BTREE_UPDATE_NODES_MAX *
-						  BKEY_BTREE_PTR_U64s_MAX];
-
-	/* Nodes being added: */
-	struct keylist			new_keys;
-	u64				_new_keys[BTREE_UPDATE_NODES_MAX *
-						  BKEY_BTREE_PTR_U64s_MAX];
-
-	/* New nodes, that will be made reachable by this update: */
-	struct btree			*new_nodes[BTREE_UPDATE_NODES_MAX];
-	unsigned			nr_new_nodes;
-
-	struct btree			*old_nodes[BTREE_UPDATE_NODES_MAX];
-	__le64				old_nodes_seq[BTREE_UPDATE_NODES_MAX];
-	unsigned			nr_old_nodes;
-
-	open_bucket_idx_t		open_buckets[BTREE_UPDATE_NODES_MAX *
-						     BCH_REPLICAS_MAX];
-	open_bucket_idx_t		nr_open_buckets;
-
-	unsigned			journal_u64s;
-	u64				journal_entries[BTREE_UPDATE_JOURNAL_RES];
-
-	/* Only here to reduce stack usage on recursive splits: */
-	struct keylist			parent_keys;
-	/*
-	 * Enough room for btree_split's keys without realloc - btree node
-	 * pointers never have crc/compression info, so we only need to acount
-	 * for the pointers for three keys
-	 */
-	u64				inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
-};
-
-struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
-						  struct btree_trans *,
-						  struct btree *,
-						  struct bkey_format);
-
-int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned);
-
-int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned);
-
-int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t,
-				  unsigned, unsigned, enum btree_node_sibling);
-
-static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
-					btree_path_idx_t path_idx,
-					unsigned level, unsigned flags,
-					enum btree_node_sibling sib)
-{
-	struct btree_path *path = trans->paths + path_idx;
-	struct btree *b;
-
-	EBUG_ON(!btree_node_locked(path, level));
-
-	if (bch2_btree_node_merging_disabled)
-		return 0;
-
-	b = path->l[level].b;
-	if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold)
-		return 0;
-
-	return __bch2_foreground_maybe_merge(trans, path_idx, level, flags, sib);
-}
-
-static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
-					      btree_path_idx_t path,
-					      unsigned level,
-					      unsigned flags)
-{
-	return  bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
-						    btree_prev_sib) ?:
-		bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
-						    btree_next_sib);
-}
-
-int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
-			    struct btree *, unsigned);
-void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
-int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
-			       struct btree *, struct bkey_i *,
-			       unsigned, bool);
-int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *,
-					struct bkey_i *, unsigned, bool);
-
-void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
-
-int bch2_btree_root_alloc_fake_trans(struct btree_trans *, enum btree_id, unsigned);
-void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned);
-
-static inline unsigned btree_update_reserve_required(struct bch_fs *c,
-						     struct btree *b)
-{
-	unsigned depth = btree_node_root(c, b)->c.level + 1;
-
-	/*
-	 * Number of nodes we might have to allocate in a worst case btree
-	 * split operation - we split all the way up to the root, then allocate
-	 * a new root, unless we're already at max depth:
-	 */
-	if (depth < BTREE_MAX_DEPTH)
-		return (depth - b->c.level) * 2 + 1;
-	else
-		return (depth - b->c.level) * 2 - 1;
-}
-
-static inline void btree_node_reset_sib_u64s(struct btree *b)
-{
-	b->sib_u64s[0] = b->nr.live_u64s;
-	b->sib_u64s[1] = b->nr.live_u64s;
-}
-
-static inline void *btree_data_end(struct btree *b)
-{
-	return (void *) b->data + btree_buf_bytes(b);
-}
-
-static inline struct bkey_packed *unwritten_whiteouts_start(struct btree *b)
-{
-	return (void *) ((u64 *) btree_data_end(b) - b->whiteout_u64s);
-}
-
-static inline struct bkey_packed *unwritten_whiteouts_end(struct btree *b)
-{
-	return btree_data_end(b);
-}
-
-static inline void *write_block(struct btree *b)
-{
-	return (void *) b->data + (b->written << 9);
-}
-
-static inline bool __btree_addr_written(struct btree *b, void *p)
-{
-	return p < write_block(b);
-}
-
-static inline bool bset_written(struct btree *b, struct bset *i)
-{
-	return __btree_addr_written(b, i);
-}
-
-static inline bool bkey_written(struct btree *b, struct bkey_packed *k)
-{
-	return __btree_addr_written(b, k);
-}
-
-static inline ssize_t __bch2_btree_u64s_remaining(struct btree *b, void *end)
-{
-	ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
-		b->whiteout_u64s;
-	ssize_t total = btree_buf_bytes(b) >> 3;
-
-	/* Always leave one extra u64 for bch2_varint_decode: */
-	used++;
-
-	return total - used;
-}
-
-static inline size_t bch2_btree_keys_u64s_remaining(struct btree *b)
-{
-	ssize_t remaining = __bch2_btree_u64s_remaining(b,
-				btree_bkey_last(b, bset_tree_last(b)));
-
-	BUG_ON(remaining < 0);
-
-	if (bset_written(b, btree_bset_last(b)))
-		return 0;
-
-	return remaining;
-}
-
-#define BTREE_WRITE_SET_U64s_BITS	9
-
-static inline unsigned btree_write_set_buffer(struct btree *b)
-{
-	/*
-	 * Could buffer up larger amounts of keys for btrees with larger keys,
-	 * pending benchmarking:
-	 */
-	return 8 << BTREE_WRITE_SET_U64s_BITS;
-}
-
-static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b)
-{
-	struct bset_tree *t = bset_tree_last(b);
-	struct btree_node_entry *bne = max(write_block(b),
-			(void *) btree_bkey_last(b, bset_tree_last(b)));
-	ssize_t remaining_space =
-		__bch2_btree_u64s_remaining(b, bne->keys.start);
-
-	if (unlikely(bset_written(b, bset(b, t)))) {
-		if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
-			return bne;
-	} else {
-		if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) &&
-		    remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
-			return bne;
-	}
-
-	return NULL;
-}
-
-static inline void push_whiteout(struct btree *b, struct bpos pos)
-{
-	struct bkey_packed k;
-
-	BUG_ON(bch2_btree_keys_u64s_remaining(b) < BKEY_U64s);
-	EBUG_ON(btree_node_just_written(b));
-
-	if (!bkey_pack_pos(&k, pos, b)) {
-		struct bkey *u = (void *) &k;
-
-		bkey_init(u);
-		u->p = pos;
-	}
-
-	k.needs_whiteout = true;
-
-	b->whiteout_u64s += k.u64s;
-	bkey_p_copy(unwritten_whiteouts_start(b), &k);
-}
-
-/*
- * write lock must be held on @b (else the dirty bset that we were going to
- * insert into could be written out from under us)
- */
-static inline bool bch2_btree_node_insert_fits(struct btree *b, unsigned u64s)
-{
-	if (unlikely(btree_node_need_rewrite(b)))
-		return false;
-
-	return u64s <= bch2_btree_keys_u64s_remaining(b);
-}
-
-void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);
-
-bool bch2_btree_interior_updates_flush(struct bch_fs *);
-
-void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *);
-struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
-					struct jset_entry *, unsigned long);
-
-void bch2_do_pending_node_rewrites(struct bch_fs *);
-void bch2_free_pending_node_rewrites(struct bch_fs *);
-
-void bch2_fs_btree_interior_update_exit(struct bch_fs *);
-void bch2_fs_btree_interior_update_init_early(struct bch_fs *);
-int bch2_fs_btree_interior_update_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
deleted file mode 100644
index 75c8a196b3f6..000000000000
--- a/fs/bcachefs/btree_write_buffer.c
+++ /dev/null
@@ -1,670 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-
-#include <linux/prefetch.h>
-#include <linux/sort.h>
-
-static int bch2_btree_write_buffer_journal_flush(struct journal *,
-				struct journal_entry_pin *, u64);
-
-static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *);
-
-static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
-{
-	return (cmp_int(l->hi, r->hi) ?:
-		cmp_int(l->mi, r->mi) ?:
-		cmp_int(l->lo, r->lo)) >= 0;
-}
-
-static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
-{
-#ifdef CONFIG_X86_64
-	int cmp;
-
-	asm("mov   (%[l]), %%rax;"
-	    "sub   (%[r]), %%rax;"
-	    "mov  8(%[l]), %%rax;"
-	    "sbb  8(%[r]), %%rax;"
-	    "mov 16(%[l]), %%rax;"
-	    "sbb 16(%[r]), %%rax;"
-	    : "=@ccae" (cmp)
-	    : [l] "r" (l), [r] "r" (r)
-	    : "rax", "cc");
-
-	EBUG_ON(cmp != __wb_key_ref_cmp(l, r));
-	return cmp;
-#else
-	return __wb_key_ref_cmp(l, r);
-#endif
-}
-
-static int wb_key_seq_cmp(const void *_l, const void *_r)
-{
-	const struct btree_write_buffered_key *l = _l;
-	const struct btree_write_buffered_key *r = _r;
-
-	return cmp_int(l->journal_seq, r->journal_seq);
-}
-
-/* Compare excluding idx, the low 24 bits: */
-static inline bool wb_key_eq(const void *_l, const void *_r)
-{
-	const struct wb_key_ref *l = _l;
-	const struct wb_key_ref *r = _r;
-
-	return !((l->hi ^ r->hi)|
-		 (l->mi ^ r->mi)|
-		 ((l->lo >> 24) ^ (r->lo >> 24)));
-}
-
-static noinline void wb_sort(struct wb_key_ref *base, size_t num)
-{
-	size_t n = num, a = num / 2;
-
-	if (!a)		/* num < 2 || size == 0 */
-		return;
-
-	for (;;) {
-		size_t b, c, d;
-
-		if (a)			/* Building heap: sift down --a */
-			--a;
-		else if (--n)		/* Sorting: Extract root to --n */
-			swap(base[0], base[n]);
-		else			/* Sort complete */
-			break;
-
-		/*
-		 * Sift element at "a" down into heap.  This is the
-		 * "bottom-up" variant, which significantly reduces
-		 * calls to cmp_func(): we find the sift-down path all
-		 * the way to the leaves (one compare per level), then
-		 * backtrack to find where to insert the target element.
-		 *
-		 * Because elements tend to sift down close to the leaves,
-		 * this uses fewer compares than doing two per level
-		 * on the way down.  (A bit more than half as many on
-		 * average, 3/4 worst-case.)
-		 */
-		for (b = a; c = 2*b + 1, (d = c + 1) < n;)
-			b = wb_key_ref_cmp(base + c, base + d) ? c : d;
-		if (d == n)		/* Special case last leaf with no sibling */
-			b = c;
-
-		/* Now backtrack from "b" to the correct location for "a" */
-		while (b != a && wb_key_ref_cmp(base + a, base + b))
-			b = (b - 1) / 2;
-		c = b;			/* Where "a" belongs */
-		while (b != a) {	/* Shift it into place */
-			b = (b - 1) / 2;
-			swap(base[b], base[c]);
-		}
-	}
-}
-
-static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
-					  struct btree_iter *iter,
-					  struct btree_write_buffered_key *wb)
-{
-	struct btree_path *path = btree_iter_path(trans, iter);
-
-	bch2_btree_node_unlock_write(trans, path, path->l[0].b);
-
-	trans->journal_res.seq = wb->journal_seq;
-
-	return bch2_trans_update(trans, iter, &wb->k,
-				 BTREE_UPDATE_internal_snapshot_node) ?:
-		bch2_trans_commit(trans, NULL, NULL,
-				  BCH_TRANS_COMMIT_no_enospc|
-				  BCH_TRANS_COMMIT_no_check_rw|
-				  BCH_TRANS_COMMIT_no_journal_res|
-				  BCH_TRANS_COMMIT_journal_reclaim);
-}
-
-static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
-			       struct btree_write_buffered_key *wb,
-			       bool *write_locked, size_t *fast)
-{
-	struct btree_path *path;
-	int ret;
-
-	EBUG_ON(!wb->journal_seq);
-	EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq);
-	EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
-
-	ret = bch2_btree_iter_traverse(iter);
-	if (ret)
-		return ret;
-
-	/*
-	 * We can't clone a path that has write locks: unshare it now, before
-	 * set_pos and traverse():
-	 */
-	if (btree_iter_path(trans, iter)->ref > 1)
-		iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_);
-
-	path = btree_iter_path(trans, iter);
-
-	if (!*write_locked) {
-		ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c);
-		if (ret)
-			return ret;
-
-		bch2_btree_node_prep_for_write(trans, path, path->l[0].b);
-		*write_locked = true;
-	}
-
-	if (unlikely(!bch2_btree_node_insert_fits(path->l[0].b, wb->k.k.u64s))) {
-		*write_locked = false;
-		return wb_flush_one_slowpath(trans, iter, wb);
-	}
-
-	bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
-	(*fast)++;
-	return 0;
-}
-
-/*
- * Update a btree with a write buffered key using the journal seq of the
- * original write buffer insert.
- *
- * It is not safe to rejournal the key once it has been inserted into the write
- * buffer because that may break recovery ordering. For example, the key may
- * have already been modified in the active write buffer in a seq that comes
- * before the current transaction. If we were to journal this key again and
- * crash, recovery would process updates in the wrong order.
- */
-static int
-btree_write_buffered_insert(struct btree_trans *trans,
-			  struct btree_write_buffered_key *wb)
-{
-	struct btree_iter iter;
-	int ret;
-
-	bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
-			     BTREE_ITER_cached|BTREE_ITER_intent);
-
-	trans->journal_res.seq = wb->journal_seq;
-
-	ret   = bch2_btree_iter_traverse(&iter) ?:
-		bch2_trans_update(trans, &iter, &wb->k,
-				  BTREE_UPDATE_internal_snapshot_node);
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb)
-{
-	struct bch_fs *c = container_of(wb, struct bch_fs, btree_write_buffer);
-	struct journal *j = &c->journal;
-
-	if (!wb->inc.keys.nr)
-		return;
-
-	bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin,
-			     bch2_btree_write_buffer_journal_flush);
-
-	darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr));
-	darray_resize(&wb->sorted, wb->flushing.keys.size);
-
-	if (!wb->flushing.keys.nr && wb->sorted.size >= wb->inc.keys.nr) {
-		swap(wb->flushing.keys, wb->inc.keys);
-		goto out;
-	}
-
-	size_t nr = min(darray_room(wb->flushing.keys),
-			wb->sorted.size - wb->flushing.keys.nr);
-	nr = min(nr, wb->inc.keys.nr);
-
-	memcpy(&darray_top(wb->flushing.keys),
-	       wb->inc.keys.data,
-	       sizeof(wb->inc.keys.data[0]) * nr);
-
-	memmove(wb->inc.keys.data,
-		wb->inc.keys.data + nr,
-	       sizeof(wb->inc.keys.data[0]) * (wb->inc.keys.nr - nr));
-
-	wb->flushing.keys.nr	+= nr;
-	wb->inc.keys.nr		-= nr;
-out:
-	if (!wb->inc.keys.nr)
-		bch2_journal_pin_drop(j, &wb->inc.pin);
-	else
-		bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin,
-					bch2_btree_write_buffer_journal_flush);
-
-	if (j->watermark) {
-		spin_lock(&j->lock);
-		bch2_journal_set_watermark(j);
-		spin_unlock(&j->lock);
-	}
-
-	BUG_ON(wb->sorted.size < wb->flushing.keys.nr);
-}
-
-static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
-{
-	struct bch_fs *c = trans->c;
-	struct journal *j = &c->journal;
-	struct btree_write_buffer *wb = &c->btree_write_buffer;
-	struct btree_iter iter = { NULL };
-	size_t skipped = 0, fast = 0, slowpath = 0;
-	bool write_locked = false;
-	int ret = 0;
-
-	bch2_trans_unlock(trans);
-	bch2_trans_begin(trans);
-
-	mutex_lock(&wb->inc.lock);
-	move_keys_from_inc_to_flushing(wb);
-	mutex_unlock(&wb->inc.lock);
-
-	for (size_t i = 0; i < wb->flushing.keys.nr; i++) {
-		wb->sorted.data[i].idx = i;
-		wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree;
-		memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos));
-	}
-	wb->sorted.nr = wb->flushing.keys.nr;
-
-	/*
-	 * We first sort so that we can detect and skip redundant updates, and
-	 * then we attempt to flush in sorted btree order, as this is most
-	 * efficient.
-	 *
-	 * However, since we're not flushing in the order they appear in the
-	 * journal we won't be able to drop our journal pin until everything is
-	 * flushed - which means this could deadlock the journal if we weren't
-	 * passing BCH_TRANS_COMMIT_journal_reclaim. This causes the update to fail
-	 * if it would block taking a journal reservation.
-	 *
-	 * If that happens, simply skip the key so we can optimistically insert
-	 * as many keys as possible in the fast path.
-	 */
-	wb_sort(wb->sorted.data, wb->sorted.nr);
-
-	darray_for_each(wb->sorted, i) {
-		struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
-
-		for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
-			prefetch(&wb->flushing.keys.data[n->idx]);
-
-		BUG_ON(!k->journal_seq);
-
-		if (i + 1 < &darray_top(wb->sorted) &&
-		    wb_key_eq(i, i + 1)) {
-			struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
-
-			skipped++;
-			n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);
-			k->journal_seq = 0;
-			continue;
-		}
-
-		if (write_locked) {
-			struct btree_path *path = btree_iter_path(trans, &iter);
-
-			if (path->btree_id != i->btree ||
-			    bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) {
-				bch2_btree_node_unlock_write(trans, path, path->l[0].b);
-				write_locked = false;
-
-				ret = lockrestart_do(trans,
-					bch2_btree_iter_traverse(&iter) ?:
-					bch2_foreground_maybe_merge(trans, iter.path, 0,
-							BCH_WATERMARK_reclaim|
-							BCH_TRANS_COMMIT_journal_reclaim|
-							BCH_TRANS_COMMIT_no_check_rw|
-							BCH_TRANS_COMMIT_no_enospc));
-				if (ret)
-					goto err;
-			}
-		}
-
-		if (!iter.path || iter.btree_id != k->btree) {
-			bch2_trans_iter_exit(trans, &iter);
-			bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p,
-					     BTREE_ITER_intent|BTREE_ITER_all_snapshots);
-		}
-
-		bch2_btree_iter_set_pos(&iter, k->k.k.p);
-		btree_iter_path(trans, &iter)->preserve = false;
-
-		do {
-			if (race_fault()) {
-				ret = -BCH_ERR_journal_reclaim_would_deadlock;
-				break;
-			}
-
-			ret = wb_flush_one(trans, &iter, k, &write_locked, &fast);
-			if (!write_locked)
-				bch2_trans_begin(trans);
-		} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
-
-		if (!ret) {
-			k->journal_seq = 0;
-		} else if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
-			slowpath++;
-			ret = 0;
-		} else
-			break;
-	}
-
-	if (write_locked) {
-		struct btree_path *path = btree_iter_path(trans, &iter);
-		bch2_btree_node_unlock_write(trans, path, path->l[0].b);
-	}
-	bch2_trans_iter_exit(trans, &iter);
-
-	if (ret)
-		goto err;
-
-	if (slowpath) {
-		/*
-		 * Flush in the order they were present in the journal, so that
-		 * we can release journal pins:
-		 * The fastpath zapped the seq of keys that were successfully flushed so
-		 * we can skip those here.
-		 */
-		trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr);
-
-		sort(wb->flushing.keys.data,
-		     wb->flushing.keys.nr,
-		     sizeof(wb->flushing.keys.data[0]),
-		     wb_key_seq_cmp, NULL);
-
-		darray_for_each(wb->flushing.keys, i) {
-			if (!i->journal_seq)
-				continue;
-
-			bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
-						bch2_btree_write_buffer_journal_flush);
-
-			bch2_trans_begin(trans);
-
-			ret = commit_do(trans, NULL, NULL,
-					BCH_WATERMARK_reclaim|
-					BCH_TRANS_COMMIT_journal_reclaim|
-					BCH_TRANS_COMMIT_no_check_rw|
-					BCH_TRANS_COMMIT_no_enospc|
-					BCH_TRANS_COMMIT_no_journal_res ,
-					btree_write_buffered_insert(trans, i));
-			if (ret)
-				goto err;
-		}
-	}
-err:
-	bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret));
-	trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
-	bch2_journal_pin_drop(j, &wb->flushing.pin);
-	wb->flushing.keys.nr = 0;
-	return ret;
-}
-
-static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq)
-{
-	struct journal *j = &c->journal;
-	struct journal_buf *buf;
-	int ret = 0;
-
-	while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, seq))) {
-		ret = bch2_journal_keys_to_write_buffer(c, buf);
-		mutex_unlock(&j->buf_lock);
-	}
-
-	return ret;
-}
-
-static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_write_buffer *wb = &c->btree_write_buffer;
-	int ret = 0, fetch_from_journal_err;
-
-	do {
-		bch2_trans_unlock(trans);
-
-		fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq);
-
-		/*
-		 * On memory allocation failure, bch2_btree_write_buffer_flush_locked()
-		 * is not guaranteed to empty wb->inc:
-		 */
-		mutex_lock(&wb->flushing.lock);
-		ret = bch2_btree_write_buffer_flush_locked(trans);
-		mutex_unlock(&wb->flushing.lock);
-	} while (!ret &&
-		 (fetch_from_journal_err ||
-		  (wb->inc.pin.seq && wb->inc.pin.seq <= seq) ||
-		  (wb->flushing.pin.seq && wb->flushing.pin.seq <= seq)));
-
-	return ret;
-}
-
-static int bch2_btree_write_buffer_journal_flush(struct journal *j,
-				struct journal_entry_pin *_pin, u64 seq)
-{
-	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
-	return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq));
-}
-
-int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
-{
-	struct bch_fs *c = trans->c;
-
-	trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_);
-
-	return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal));
-}
-
-int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_write_buffer *wb = &c->btree_write_buffer;
-	int ret = 0;
-
-	if (mutex_trylock(&wb->flushing.lock)) {
-		ret = bch2_btree_write_buffer_flush_locked(trans);
-		mutex_unlock(&wb->flushing.lock);
-	}
-
-	return ret;
-}
-
-int bch2_btree_write_buffer_tryflush(struct btree_trans *trans)
-{
-	struct bch_fs *c = trans->c;
-
-	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer))
-		return -BCH_ERR_erofs_no_writes;
-
-	int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans);
-	bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
-	return ret;
-}
-
-static void bch2_btree_write_buffer_flush_work(struct work_struct *work)
-{
-	struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work);
-	struct btree_write_buffer *wb = &c->btree_write_buffer;
-	int ret;
-
-	mutex_lock(&wb->flushing.lock);
-	do {
-		ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans));
-	} while (!ret && bch2_btree_write_buffer_should_flush(c));
-	mutex_unlock(&wb->flushing.lock);
-
-	bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
-}
-
-int bch2_journal_key_to_wb_slowpath(struct bch_fs *c,
-			     struct journal_keys_to_wb *dst,
-			     enum btree_id btree, struct bkey_i *k)
-{
-	struct btree_write_buffer *wb = &c->btree_write_buffer;
-	int ret;
-retry:
-	ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL);
-	if (!ret && dst->wb == &wb->flushing)
-		ret = darray_resize(&wb->sorted, wb->flushing.keys.size);
-
-	if (unlikely(ret)) {
-		if (dst->wb == &c->btree_write_buffer.flushing) {
-			mutex_unlock(&dst->wb->lock);
-			dst->wb = &c->btree_write_buffer.inc;
-			bch2_journal_pin_add(&c->journal, dst->seq, &dst->wb->pin,
-					     bch2_btree_write_buffer_journal_flush);
-			goto retry;
-		}
-
-		return ret;
-	}
-
-	dst->room = darray_room(dst->wb->keys);
-	if (dst->wb == &wb->flushing)
-		dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
-	BUG_ON(!dst->room);
-	BUG_ON(!dst->seq);
-
-	struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
-	wb_k->journal_seq	= dst->seq;
-	wb_k->btree		= btree;
-	bkey_copy(&wb_k->k, k);
-	dst->wb->keys.nr++;
-	dst->room--;
-	return 0;
-}
-
-void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_keys_to_wb *dst, u64 seq)
-{
-	struct btree_write_buffer *wb = &c->btree_write_buffer;
-
-	if (mutex_trylock(&wb->flushing.lock)) {
-		mutex_lock(&wb->inc.lock);
-		move_keys_from_inc_to_flushing(wb);
-
-		/*
-		 * Attempt to skip wb->inc, and add keys directly to
-		 * wb->flushing, saving us a copy later:
-		 */
-
-		if (!wb->inc.keys.nr) {
-			dst->wb = &wb->flushing;
-		} else {
-			mutex_unlock(&wb->flushing.lock);
-			dst->wb = &wb->inc;
-		}
-	} else {
-		mutex_lock(&wb->inc.lock);
-		dst->wb = &wb->inc;
-	}
-
-	dst->room = darray_room(dst->wb->keys);
-	if (dst->wb == &wb->flushing)
-		dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
-	dst->seq = seq;
-
-	bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin,
-			     bch2_btree_write_buffer_journal_flush);
-}
-
-void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst)
-{
-	struct btree_write_buffer *wb = &c->btree_write_buffer;
-
-	if (!dst->wb->keys.nr)
-		bch2_journal_pin_drop(&c->journal, &dst->wb->pin);
-
-	if (bch2_btree_write_buffer_should_flush(c) &&
-	    __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) &&
-	    !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work))
-		bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
-
-	if (dst->wb == &wb->flushing)
-		mutex_unlock(&wb->flushing.lock);
-	mutex_unlock(&wb->inc.lock);
-}
-
-static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
-{
-	struct journal_keys_to_wb dst;
-	int ret = 0;
-
-	bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));
-
-	for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) {
-		jset_entry_for_each_key(entry, k) {
-			ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k);
-			if (ret)
-				goto out;
-		}
-
-		entry->type = BCH_JSET_ENTRY_btree_keys;
-	}
-
-	spin_lock(&c->journal.lock);
-	buf->need_flush_to_write_buffer = false;
-	spin_unlock(&c->journal.lock);
-out:
-	bch2_journal_keys_to_write_buffer_end(c, &dst);
-	return ret;
-}
-
-static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size)
-{
-	if (wb->keys.size >= new_size)
-		return 0;
-
-	if (!mutex_trylock(&wb->lock))
-		return -EINTR;
-
-	int ret = darray_resize(&wb->keys, new_size);
-	mutex_unlock(&wb->lock);
-	return ret;
-}
-
-int bch2_btree_write_buffer_resize(struct bch_fs *c, size_t new_size)
-{
-	struct btree_write_buffer *wb = &c->btree_write_buffer;
-
-	return wb_keys_resize(&wb->flushing, new_size) ?:
-		wb_keys_resize(&wb->inc, new_size);
-}
-
-void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
-{
-	struct btree_write_buffer *wb = &c->btree_write_buffer;
-
-	BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) &&
-	       !bch2_journal_error(&c->journal));
-
-	darray_exit(&wb->sorted);
-	darray_exit(&wb->flushing.keys);
-	darray_exit(&wb->inc.keys);
-}
-
-int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
-{
-	struct btree_write_buffer *wb = &c->btree_write_buffer;
-
-	mutex_init(&wb->inc.lock);
-	mutex_init(&wb->flushing.lock);
-	INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work);
-
-	/* Will be resized by journal as needed: */
-	unsigned initial_size = 1 << 16;
-
-	return  darray_make_room(&wb->inc.keys, initial_size) ?:
-		darray_make_room(&wb->flushing.keys, initial_size) ?:
-		darray_make_room(&wb->sorted, initial_size);
-}
diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h
deleted file mode 100644
index eebcd2b15249..000000000000
--- a/fs/bcachefs/btree_write_buffer.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H
-#define _BCACHEFS_BTREE_WRITE_BUFFER_H
-
-#include "bkey.h"
-
-static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c)
-{
-	struct btree_write_buffer *wb = &c->btree_write_buffer;
-
-	return wb->inc.keys.nr + wb->flushing.keys.nr > wb->inc.keys.size / 4;
-}
-
-static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c)
-{
-	struct btree_write_buffer *wb = &c->btree_write_buffer;
-
-	return wb->inc.keys.nr > wb->inc.keys.size * 3 / 4;
-}
-
-struct btree_trans;
-int bch2_btree_write_buffer_flush_sync(struct btree_trans *);
-int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *);
-int bch2_btree_write_buffer_tryflush(struct btree_trans *);
-
-struct journal_keys_to_wb {
-	struct btree_write_buffer_keys	*wb;
-	size_t				room;
-	u64				seq;
-};
-
-int bch2_journal_key_to_wb_slowpath(struct bch_fs *,
-			     struct journal_keys_to_wb *,
-			     enum btree_id, struct bkey_i *);
-
-static inline int bch2_journal_key_to_wb(struct bch_fs *c,
-			     struct journal_keys_to_wb *dst,
-			     enum btree_id btree, struct bkey_i *k)
-{
-	EBUG_ON(!dst->seq);
-
-	if (unlikely(!dst->room))
-		return bch2_journal_key_to_wb_slowpath(c, dst, btree, k);
-
-	struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
-	wb_k->journal_seq	= dst->seq;
-	wb_k->btree		= btree;
-	bkey_copy(&wb_k->k, k);
-	dst->wb->keys.nr++;
-	dst->room--;
-	return 0;
-}
-
-void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64);
-void bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *);
-
-int bch2_btree_write_buffer_resize(struct bch_fs *, size_t);
-void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
-int bch2_fs_btree_write_buffer_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */
diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h
deleted file mode 100644
index 9b9433de9c36..000000000000
--- a/fs/bcachefs/btree_write_buffer_types.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
-#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
-
-#include "darray.h"
-#include "journal_types.h"
-
-#define BTREE_WRITE_BUFERED_VAL_U64s_MAX	4
-#define BTREE_WRITE_BUFERED_U64s_MAX	(BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX)
-
-struct wb_key_ref {
-union {
-	struct {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-		unsigned			idx:24;
-		u8				pos[sizeof(struct bpos)];
-		enum btree_id			btree:8;
-#else
-		enum btree_id			btree:8;
-		u8				pos[sizeof(struct bpos)];
-		unsigned			idx:24;
-#endif
-	} __packed;
-	struct {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-		u64 lo;
-		u64 mi;
-		u64 hi;
-#else
-		u64 hi;
-		u64 mi;
-		u64 lo;
-#endif
-	};
-};
-};
-
-struct btree_write_buffered_key {
-	enum btree_id			btree:8;
-	u64				journal_seq:56;
-	__BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
-};
-
-struct btree_write_buffer_keys {
-	DARRAY(struct btree_write_buffered_key) keys;
-	struct journal_entry_pin	pin;
-	struct mutex			lock;
-};
-
-struct btree_write_buffer {
-	DARRAY(struct wb_key_ref)	sorted;
-	struct btree_write_buffer_keys	inc;
-	struct btree_write_buffer_keys	flushing;
-	struct work_struct		flush_work;
-};
-
-#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
deleted file mode 100644
index e28d28ac2a13..000000000000
--- a/fs/bcachefs/buckets.c
+++ /dev/null
@@ -1,1656 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Code for manipulating bucket marks for garbage collection.
- *
- * Copyright 2014 Datera, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "backpointers.h"
-#include "bset.h"
-#include "btree_gc.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "buckets_waiting_for_journal.h"
-#include "ec.h"
-#include "error.h"
-#include "inode.h"
-#include "movinggc.h"
-#include "recovery.h"
-#include "reflink.h"
-#include "replicas.h"
-#include "subvolume.h"
-#include "trace.h"
-
-#include <linux/preempt.h>
-
-static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
-					      enum bch_data_type data_type,
-					      s64 sectors)
-{
-	switch (data_type) {
-	case BCH_DATA_btree:
-		fs_usage->btree		+= sectors;
-		break;
-	case BCH_DATA_user:
-	case BCH_DATA_parity:
-		fs_usage->data		+= sectors;
-		break;
-	case BCH_DATA_cached:
-		fs_usage->cached	+= sectors;
-		break;
-	default:
-		break;
-	}
-}
-
-void bch2_fs_usage_initialize(struct bch_fs *c)
-{
-	percpu_down_write(&c->mark_lock);
-	struct bch_fs_usage *usage = c->usage_base;
-
-	for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++)
-		bch2_fs_usage_acc_to_base(c, i);
-
-	for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++)
-		usage->b.reserved += usage->persistent_reserved[i];
-
-	for (unsigned i = 0; i < c->replicas.nr; i++) {
-		struct bch_replicas_entry_v1 *e =
-			cpu_replicas_entry(&c->replicas, i);
-
-		fs_usage_data_type_to_base(&usage->b, e->data_type, usage->replicas[i]);
-	}
-
-	for_each_member_device(c, ca) {
-		struct bch_dev_usage dev = bch2_dev_usage_read(ca);
-
-		usage->b.hidden += (dev.d[BCH_DATA_sb].buckets +
-				    dev.d[BCH_DATA_journal].buckets) *
-			ca->mi.bucket_size;
-	}
-
-	percpu_up_write(&c->mark_lock);
-}
-
-static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
-						  unsigned journal_seq,
-						  bool gc)
-{
-	BUG_ON(!gc && !journal_seq);
-
-	return this_cpu_ptr(gc
-			    ? ca->usage_gc
-			    : ca->usage[journal_seq & JOURNAL_BUF_MASK]);
-}
-
-void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage)
-{
-	struct bch_fs *c = ca->fs;
-	unsigned seq, i, u64s = dev_usage_u64s();
-
-	do {
-		seq = read_seqcount_begin(&c->usage_lock);
-		memcpy(usage, ca->usage_base, u64s * sizeof(u64));
-		for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
-			acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage[i], u64s);
-	} while (read_seqcount_retry(&c->usage_lock, seq));
-}
-
-u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
-{
-	ssize_t offset = v - (u64 *) c->usage_base;
-	unsigned i, seq;
-	u64 ret;
-
-	BUG_ON(offset < 0 || offset >= fs_usage_u64s(c));
-	percpu_rwsem_assert_held(&c->mark_lock);
-
-	do {
-		seq = read_seqcount_begin(&c->usage_lock);
-		ret = *v;
-
-		for (i = 0; i < ARRAY_SIZE(c->usage); i++)
-			ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset);
-	} while (read_seqcount_retry(&c->usage_lock, seq));
-
-	return ret;
-}
-
-struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c)
-{
-	struct bch_fs_usage_online *ret;
-	unsigned nr_replicas = READ_ONCE(c->replicas.nr);
-	unsigned seq, i;
-retry:
-	ret = kmalloc(__fs_usage_online_u64s(nr_replicas) * sizeof(u64), GFP_KERNEL);
-	if (unlikely(!ret))
-		return NULL;
-
-	percpu_down_read(&c->mark_lock);
-
-	if (nr_replicas != c->replicas.nr) {
-		nr_replicas = c->replicas.nr;
-		percpu_up_read(&c->mark_lock);
-		kfree(ret);
-		goto retry;
-	}
-
-	ret->online_reserved = percpu_u64_get(c->online_reserved);
-
-	do {
-		seq = read_seqcount_begin(&c->usage_lock);
-		unsafe_memcpy(&ret->u, c->usage_base,
-			      __fs_usage_u64s(nr_replicas) * sizeof(u64),
-			      "embedded variable length struct");
-		for (i = 0; i < ARRAY_SIZE(c->usage); i++)
-			acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i],
-					__fs_usage_u64s(nr_replicas));
-	} while (read_seqcount_retry(&c->usage_lock, seq));
-
-	return ret;
-}
-
-void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
-{
-	unsigned u64s = fs_usage_u64s(c);
-
-	BUG_ON(idx >= ARRAY_SIZE(c->usage));
-
-	preempt_disable();
-	write_seqcount_begin(&c->usage_lock);
-
-	acc_u64s_percpu((u64 *) c->usage_base,
-			(u64 __percpu *) c->usage[idx], u64s);
-	percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
-
-	rcu_read_lock();
-	for_each_member_device_rcu(c, ca, NULL) {
-		u64s = dev_usage_u64s();
-
-		acc_u64s_percpu((u64 *) ca->usage_base,
-				(u64 __percpu *) ca->usage[idx], u64s);
-		percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
-	}
-	rcu_read_unlock();
-
-	write_seqcount_end(&c->usage_lock);
-	preempt_enable();
-}
-
-void bch2_fs_usage_to_text(struct printbuf *out,
-			   struct bch_fs *c,
-			   struct bch_fs_usage_online *fs_usage)
-{
-	unsigned i;
-
-	prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity);
-
-	prt_printf(out, "hidden:\t\t\t\t%llu\n",
-	       fs_usage->u.b.hidden);
-	prt_printf(out, "data:\t\t\t\t%llu\n",
-	       fs_usage->u.b.data);
-	prt_printf(out, "cached:\t\t\t\t%llu\n",
-	       fs_usage->u.b.cached);
-	prt_printf(out, "reserved:\t\t\t%llu\n",
-	       fs_usage->u.b.reserved);
-	prt_printf(out, "nr_inodes:\t\t\t%llu\n",
-	       fs_usage->u.b.nr_inodes);
-	prt_printf(out, "online reserved:\t\t%llu\n",
-	       fs_usage->online_reserved);
-
-	for (i = 0;
-	     i < ARRAY_SIZE(fs_usage->u.persistent_reserved);
-	     i++) {
-		prt_printf(out, "%u replicas:\n", i + 1);
-		prt_printf(out, "\treserved:\t\t%llu\n",
-		       fs_usage->u.persistent_reserved[i]);
-	}
-
-	for (i = 0; i < c->replicas.nr; i++) {
-		struct bch_replicas_entry_v1 *e =
-			cpu_replicas_entry(&c->replicas, i);
-
-		prt_printf(out, "\t");
-		bch2_replicas_entry_to_text(out, e);
-		prt_printf(out, ":\t%llu\n", fs_usage->u.replicas[i]);
-	}
-}
-
-static u64 reserve_factor(u64 r)
-{
-	return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
-}
-
-u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage)
-{
-	return min(fs_usage->u.b.hidden +
-		   fs_usage->u.b.btree +
-		   fs_usage->u.b.data +
-		   reserve_factor(fs_usage->u.b.reserved +
-				  fs_usage->online_reserved),
-		   c->capacity);
-}
-
-static struct bch_fs_usage_short
-__bch2_fs_usage_read_short(struct bch_fs *c)
-{
-	struct bch_fs_usage_short ret;
-	u64 data, reserved;
-
-	ret.capacity = c->capacity -
-		bch2_fs_usage_read_one(c, &c->usage_base->b.hidden);
-
-	data		= bch2_fs_usage_read_one(c, &c->usage_base->b.data) +
-		bch2_fs_usage_read_one(c, &c->usage_base->b.btree);
-	reserved	= bch2_fs_usage_read_one(c, &c->usage_base->b.reserved) +
-		percpu_u64_get(c->online_reserved);
-
-	ret.used	= min(ret.capacity, data + reserve_factor(reserved));
-	ret.free	= ret.capacity - ret.used;
-
-	ret.nr_inodes	= bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes);
-
-	return ret;
-}
-
-struct bch_fs_usage_short
-bch2_fs_usage_read_short(struct bch_fs *c)
-{
-	struct bch_fs_usage_short ret;
-
-	percpu_down_read(&c->mark_lock);
-	ret = __bch2_fs_usage_read_short(c);
-	percpu_up_read(&c->mark_lock);
-
-	return ret;
-}
-
-void bch2_dev_usage_init(struct bch_dev *ca)
-{
-	ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket;
-}
-
-void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage)
-{
-	prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n");
-
-	for (unsigned i = 0; i < BCH_DATA_NR; i++) {
-		bch2_prt_data_type(out, i);
-		prt_printf(out, "\t%llu\r%llu\r%llu\r\n",
-			usage->d[i].buckets,
-			usage->d[i].sectors,
-			usage->d[i].fragmented);
-	}
-}
-
-void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
-			   const struct bch_alloc_v4 *old,
-			   const struct bch_alloc_v4 *new,
-			   u64 journal_seq, bool gc)
-{
-	struct bch_fs_usage *fs_usage;
-	struct bch_dev_usage *u;
-
-	preempt_disable();
-	fs_usage = fs_usage_ptr(c, journal_seq, gc);
-
-	if (data_type_is_hidden(old->data_type))
-		fs_usage->b.hidden -= ca->mi.bucket_size;
-	if (data_type_is_hidden(new->data_type))
-		fs_usage->b.hidden += ca->mi.bucket_size;
-
-	u = dev_usage_ptr(ca, journal_seq, gc);
-
-	u->d[old->data_type].buckets--;
-	u->d[new->data_type].buckets++;
-
-	u->d[old->data_type].sectors -= bch2_bucket_sectors_dirty(*old);
-	u->d[new->data_type].sectors += bch2_bucket_sectors_dirty(*new);
-
-	u->d[BCH_DATA_cached].sectors += new->cached_sectors;
-	u->d[BCH_DATA_cached].sectors -= old->cached_sectors;
-
-	u->d[old->data_type].fragmented -= bch2_bucket_sectors_fragmented(ca, *old);
-	u->d[new->data_type].fragmented += bch2_bucket_sectors_fragmented(ca, *new);
-
-	preempt_enable();
-}
-
-static inline int __update_replicas(struct bch_fs *c,
-				    struct bch_fs_usage *fs_usage,
-				    struct bch_replicas_entry_v1 *r,
-				    s64 sectors)
-{
-	int idx = bch2_replicas_entry_idx(c, r);
-
-	if (idx < 0)
-		return -1;
-
-	fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors);
-	fs_usage->replicas[idx]		+= sectors;
-	return 0;
-}
-
-int bch2_update_replicas(struct bch_fs *c, struct bkey_s_c k,
-			 struct bch_replicas_entry_v1 *r, s64 sectors,
-			 unsigned journal_seq, bool gc)
-{
-	struct bch_fs_usage *fs_usage;
-	int idx, ret = 0;
-	struct printbuf buf = PRINTBUF;
-
-	percpu_down_read(&c->mark_lock);
-
-	idx = bch2_replicas_entry_idx(c, r);
-	if (idx < 0 &&
-	    fsck_err(c, ptr_to_missing_replicas_entry,
-		     "no replicas entry\n  while marking %s",
-		     (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-		percpu_up_read(&c->mark_lock);
-		ret = bch2_mark_replicas(c, r);
-		percpu_down_read(&c->mark_lock);
-
-		if (ret)
-			goto err;
-		idx = bch2_replicas_entry_idx(c, r);
-	}
-	if (idx < 0) {
-		ret = -1;
-		goto err;
-	}
-
-	preempt_disable();
-	fs_usage = fs_usage_ptr(c, journal_seq, gc);
-	fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors);
-	fs_usage->replicas[idx]		+= sectors;
-	preempt_enable();
-err:
-fsck_err:
-	percpu_up_read(&c->mark_lock);
-	printbuf_exit(&buf);
-	return ret;
-}
-
-static inline int update_cached_sectors(struct bch_fs *c,
-			struct bkey_s_c k,
-			unsigned dev, s64 sectors,
-			unsigned journal_seq, bool gc)
-{
-	struct bch_replicas_padded r;
-
-	bch2_replicas_entry_cached(&r.e, dev);
-
-	return bch2_update_replicas(c, k, &r.e, sectors, journal_seq, gc);
-}
-
-static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more,
-				     gfp_t gfp)
-{
-	struct replicas_delta_list *d = trans->fs_usage_deltas;
-	unsigned new_size = d ? (d->size + more) * 2 : 128;
-	unsigned alloc_size = sizeof(*d) + new_size;
-
-	WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX);
-
-	if (!d || d->used + more > d->size) {
-		d = krealloc(d, alloc_size, gfp|__GFP_ZERO);
-
-		if (unlikely(!d)) {
-			if (alloc_size > REPLICAS_DELTA_LIST_MAX)
-				return -ENOMEM;
-
-			d = mempool_alloc(&trans->c->replicas_delta_pool, gfp);
-			if (!d)
-				return -ENOMEM;
-
-			memset(d, 0, REPLICAS_DELTA_LIST_MAX);
-
-			if (trans->fs_usage_deltas)
-				memcpy(d, trans->fs_usage_deltas,
-				       trans->fs_usage_deltas->size + sizeof(*d));
-
-			new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d);
-			kfree(trans->fs_usage_deltas);
-		}
-
-		d->size = new_size;
-		trans->fs_usage_deltas = d;
-	}
-
-	return 0;
-}
-
-int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more)
-{
-	return allocate_dropping_locks_errcode(trans,
-				__replicas_deltas_realloc(trans, more, _gfp));
-}
-
-int bch2_update_replicas_list(struct btree_trans *trans,
-			 struct bch_replicas_entry_v1 *r,
-			 s64 sectors)
-{
-	struct replicas_delta_list *d;
-	struct replicas_delta *n;
-	unsigned b;
-	int ret;
-
-	if (!sectors)
-		return 0;
-
-	b = replicas_entry_bytes(r) + 8;
-	ret = bch2_replicas_deltas_realloc(trans, b);
-	if (ret)
-		return ret;
-
-	d = trans->fs_usage_deltas;
-	n = (void *) d->d + d->used;
-	n->delta = sectors;
-	unsafe_memcpy((void *) n + offsetof(struct replicas_delta, r),
-		      r, replicas_entry_bytes(r),
-		      "flexible array member embedded in strcuct with padding");
-	bch2_replicas_entry_sort(&n->r);
-	d->used += b;
-	return 0;
-}
-
-int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 sectors)
-{
-	struct bch_replicas_padded r;
-
-	bch2_replicas_entry_cached(&r.e, dev);
-
-	return bch2_update_replicas_list(trans, &r.e, sectors);
-}
-
-int bch2_check_fix_ptrs(struct btree_trans *trans,
-			enum btree_id btree, unsigned level, struct bkey_s_c k,
-			enum btree_iter_update_trigger_flags flags)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k);
-	const union bch_extent_entry *entry_c;
-	struct extent_ptr_decoded p = { 0 };
-	bool do_update = false;
-	struct printbuf buf = PRINTBUF;
-	int ret = 0;
-
-	percpu_down_read(&c->mark_lock);
-
-	rcu_read_lock();
-	bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) {
-		struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
-		if (!ca) {
-			if (fsck_err(c, ptr_to_invalid_device,
-				     "pointer to missing device %u\n"
-				     "while marking %s",
-				     p.ptr.dev,
-				     (printbuf_reset(&buf),
-				      bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-				do_update = true;
-			continue;
-		}
-
-		struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
-		enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry_c);
-
-		if (fsck_err_on(!g->gen_valid,
-				c, ptr_to_missing_alloc_key,
-				"bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
-				"while marking %s",
-				p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
-				bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
-				p.ptr.gen,
-				(printbuf_reset(&buf),
-				 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-			if (!p.ptr.cached) {
-				g->gen_valid		= true;
-				g->gen			= p.ptr.gen;
-			} else {
-				do_update = true;
-			}
-		}
-
-		if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
-				c, ptr_gen_newer_than_bucket_gen,
-				"bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
-				"while marking %s",
-				p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
-				bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
-				p.ptr.gen, g->gen,
-				(printbuf_reset(&buf),
-				 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-			if (!p.ptr.cached &&
-			    (g->data_type != BCH_DATA_btree ||
-			     data_type == BCH_DATA_btree)) {
-				g->gen_valid		= true;
-				g->gen			= p.ptr.gen;
-				g->data_type		= 0;
-				g->dirty_sectors	= 0;
-				g->cached_sectors	= 0;
-			} else {
-				do_update = true;
-			}
-		}
-
-		if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
-				c, ptr_gen_newer_than_bucket_gen,
-				"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
-				"while marking %s",
-				p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
-				bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
-				p.ptr.gen,
-				(printbuf_reset(&buf),
-				 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-			do_update = true;
-
-		if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
-				c, stale_dirty_ptr,
-				"bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
-				"while marking %s",
-				p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
-				bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
-				p.ptr.gen, g->gen,
-				(printbuf_reset(&buf),
-				 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-			do_update = true;
-
-		if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
-			continue;
-
-		if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type),
-				c, ptr_bucket_data_type_mismatch,
-				"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
-				"while marking %s",
-				p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
-				bch2_data_type_str(g->data_type),
-				bch2_data_type_str(data_type),
-				(printbuf_reset(&buf),
-				 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-			if (data_type == BCH_DATA_btree) {
-				g->gen_valid		= true;
-				g->gen			= p.ptr.gen;
-				g->data_type		= data_type;
-				g->dirty_sectors	= 0;
-				g->cached_sectors	= 0;
-			} else {
-				do_update = true;
-			}
-		}
-
-		if (p.has_ec) {
-			struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
-
-			if (fsck_err_on(!m || !m->alive, c,
-					ptr_to_missing_stripe,
-					"pointer to nonexistent stripe %llu\n"
-					"while marking %s",
-					(u64) p.ec.idx,
-					(printbuf_reset(&buf),
-					 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-				do_update = true;
-
-			if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c,
-					ptr_to_incorrect_stripe,
-					"pointer does not match stripe %llu\n"
-					"while marking %s",
-					(u64) p.ec.idx,
-					(printbuf_reset(&buf),
-					 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-				do_update = true;
-		}
-	}
-	rcu_read_unlock();
-
-	if (do_update) {
-		if (flags & BTREE_TRIGGER_is_root) {
-			bch_err(c, "cannot update btree roots yet");
-			ret = -EINVAL;
-			goto err;
-		}
-
-		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
-		ret = PTR_ERR_OR_ZERO(new);
-		if (ret)
-			goto err;
-
-		rcu_read_lock();
-		bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_rcu(c, ptr->dev));
-		rcu_read_unlock();
-
-		if (level) {
-			/*
-			 * We don't want to drop btree node pointers - if the
-			 * btree node isn't there anymore, the read path will
-			 * sort it out:
-			 */
-			struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
-			rcu_read_lock();
-			bkey_for_each_ptr(ptrs, ptr) {
-				struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
-				struct bucket *g = PTR_GC_BUCKET(ca, ptr);
-
-				ptr->gen = g->gen;
-			}
-			rcu_read_unlock();
-		} else {
-			struct bkey_ptrs ptrs;
-			union bch_extent_entry *entry;
-restart_drop_ptrs:
-			ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
-			rcu_read_lock();
-			bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) {
-				struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
-				struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
-				enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry);
-
-				if ((p.ptr.cached &&
-				     (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) ||
-				    (!p.ptr.cached &&
-				     gen_cmp(p.ptr.gen, g->gen) < 0) ||
-				    gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX ||
-				    (g->data_type &&
-				     g->data_type != data_type)) {
-					bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr);
-					goto restart_drop_ptrs;
-				}
-			}
-			rcu_read_unlock();
-again:
-			ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
-			bkey_extent_entry_for_each(ptrs, entry) {
-				if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
-					struct gc_stripe *m = genradix_ptr(&c->gc_stripes,
-									entry->stripe_ptr.idx);
-					union bch_extent_entry *next_ptr;
-
-					bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
-						if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
-							goto found;
-					next_ptr = NULL;
-found:
-					if (!next_ptr) {
-						bch_err(c, "aieee, found stripe ptr with no data ptr");
-						continue;
-					}
-
-					if (!m || !m->alive ||
-					    !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
-								       &next_ptr->ptr,
-								       m->sectors)) {
-						bch2_bkey_extent_entry_drop(new, entry);
-						goto again;
-					}
-				}
-			}
-		}
-
-		if (0) {
-			printbuf_reset(&buf);
-			bch2_bkey_val_to_text(&buf, c, k);
-			bch_info(c, "updated %s", buf.buf);
-
-			printbuf_reset(&buf);
-			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
-			bch_info(c, "new key %s", buf.buf);
-		}
-
-		percpu_up_read(&c->mark_lock);
-		struct btree_iter iter;
-		bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
-					  BTREE_ITER_intent|BTREE_ITER_all_snapshots);
-		ret =   bch2_btree_iter_traverse(&iter) ?:
-			bch2_trans_update(trans, &iter, new,
-					  BTREE_UPDATE_internal_snapshot_node|
-					  BTREE_TRIGGER_norun);
-		bch2_trans_iter_exit(trans, &iter);
-		percpu_down_read(&c->mark_lock);
-
-		if (ret)
-			goto err;
-
-		if (level)
-			bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
-	}
-err:
-fsck_err:
-	percpu_up_read(&c->mark_lock);
-	printbuf_exit(&buf);
-	return ret;
-}
-
-int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
-			   struct bkey_s_c k,
-			   const struct bch_extent_ptr *ptr,
-			   s64 sectors, enum bch_data_type ptr_data_type,
-			   u8 b_gen, u8 bucket_data_type,
-			   u32 *bucket_sectors)
-{
-	struct bch_fs *c = trans->c;
-	size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
-	struct printbuf buf = PRINTBUF;
-	bool inserting = sectors > 0;
-	int ret = 0;
-
-	BUG_ON(!sectors);
-
-	if (gen_after(ptr->gen, b_gen)) {
-		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
-			      BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen,
-			"bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
-			"while marking %s",
-			ptr->dev, bucket_nr, b_gen,
-			bch2_data_type_str(bucket_data_type ?: ptr_data_type),
-			ptr->gen,
-			(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
-		if (inserting)
-			goto err;
-		goto out;
-	}
-
-	if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
-		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
-			      BCH_FSCK_ERR_ptr_too_stale,
-			"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
-			"while marking %s",
-			ptr->dev, bucket_nr, b_gen,
-			bch2_data_type_str(bucket_data_type ?: ptr_data_type),
-			ptr->gen,
-			(printbuf_reset(&buf),
-			 bch2_bkey_val_to_text(&buf, c, k), buf.buf));
-		if (inserting)
-			goto err;
-		goto out;
-	}
-
-	if (b_gen != ptr->gen && ptr->cached) {
-		ret = 1;
-		goto out;
-	}
-
-	if (b_gen != ptr->gen) {
-		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
-			      BCH_FSCK_ERR_stale_dirty_ptr,
-			"bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
-			"while marking %s",
-			ptr->dev, bucket_nr, b_gen,
-			*bucket_gen(ca, bucket_nr),
-			bch2_data_type_str(bucket_data_type ?: ptr_data_type),
-			ptr->gen,
-			(printbuf_reset(&buf),
-			 bch2_bkey_val_to_text(&buf, c, k), buf.buf));
-		if (inserting)
-			goto err;
-		goto out;
-	}
-
-	if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) {
-		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
-			      BCH_FSCK_ERR_ptr_bucket_data_type_mismatch,
-			"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
-			"while marking %s",
-			ptr->dev, bucket_nr, b_gen,
-			bch2_data_type_str(bucket_data_type),
-			bch2_data_type_str(ptr_data_type),
-			(printbuf_reset(&buf),
-			 bch2_bkey_val_to_text(&buf, c, k), buf.buf));
-		if (inserting)
-			goto err;
-		goto out;
-	}
-
-	if ((u64) *bucket_sectors + sectors > U32_MAX) {
-		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
-			      BCH_FSCK_ERR_bucket_sector_count_overflow,
-			"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
-			"while marking %s",
-			ptr->dev, bucket_nr, b_gen,
-			bch2_data_type_str(bucket_data_type ?: ptr_data_type),
-			*bucket_sectors, sectors,
-			(printbuf_reset(&buf),
-			 bch2_bkey_val_to_text(&buf, c, k), buf.buf));
-		if (inserting)
-			goto err;
-		sectors = -*bucket_sectors;
-	}
-
-	*bucket_sectors += sectors;
-out:
-	printbuf_exit(&buf);
-	return ret;
-err:
-	bch2_dump_trans_updates(trans);
-	ret = -EIO;
-	goto out;
-}
-
-void bch2_trans_fs_usage_revert(struct btree_trans *trans,
-				struct replicas_delta_list *deltas)
-{
-	struct bch_fs *c = trans->c;
-	struct bch_fs_usage *dst;
-	struct replicas_delta *d, *top = (void *) deltas->d + deltas->used;
-	s64 added = 0;
-	unsigned i;
-
-	percpu_down_read(&c->mark_lock);
-	preempt_disable();
-	dst = fs_usage_ptr(c, trans->journal_res.seq, false);
-
-	/* revert changes: */
-	for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
-		switch (d->r.data_type) {
-		case BCH_DATA_btree:
-		case BCH_DATA_user:
-		case BCH_DATA_parity:
-			added += d->delta;
-		}
-		BUG_ON(__update_replicas(c, dst, &d->r, -d->delta));
-	}
-
-	dst->b.nr_inodes -= deltas->nr_inodes;
-
-	for (i = 0; i < BCH_REPLICAS_MAX; i++) {
-		added				-= deltas->persistent_reserved[i];
-		dst->b.reserved			-= deltas->persistent_reserved[i];
-		dst->persistent_reserved[i]	-= deltas->persistent_reserved[i];
-	}
-
-	if (added > 0) {
-		trans->disk_res->sectors += added;
-		this_cpu_add(*c->online_reserved, added);
-	}
-
-	preempt_enable();
-	percpu_up_read(&c->mark_lock);
-}
-
-void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
-{
-	struct bch_fs *c = trans->c;
-	u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
-	static int warned_disk_usage = 0;
-	bool warn = false;
-
-	percpu_down_read(&c->mark_lock);
-	preempt_disable();
-	struct bch_fs_usage_base *dst = &fs_usage_ptr(c, trans->journal_res.seq, false)->b;
-	struct bch_fs_usage_base *src = &trans->fs_usage_delta;
-
-	s64 added = src->btree + src->data + src->reserved;
-
-	/*
-	 * Not allowed to reduce sectors_available except by getting a
-	 * reservation:
-	 */
-	s64 should_not_have_added = added - (s64) disk_res_sectors;
-	if (unlikely(should_not_have_added > 0)) {
-		u64 old, new, v = atomic64_read(&c->sectors_available);
-
-		do {
-			old = v;
-			new = max_t(s64, 0, old - should_not_have_added);
-		} while ((v = atomic64_cmpxchg(&c->sectors_available,
-					       old, new)) != old);
-
-		added -= should_not_have_added;
-		warn = true;
-	}
-
-	if (added > 0) {
-		trans->disk_res->sectors -= added;
-		this_cpu_sub(*c->online_reserved, added);
-	}
-
-	dst->hidden	+= src->hidden;
-	dst->btree	+= src->btree;
-	dst->data	+= src->data;
-	dst->cached	+= src->cached;
-	dst->reserved	+= src->reserved;
-	dst->nr_inodes	+= src->nr_inodes;
-
-	preempt_enable();
-	percpu_up_read(&c->mark_lock);
-
-	if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
-		bch2_trans_inconsistent(trans,
-					"disk usage increased %lli more than %llu sectors reserved)",
-					should_not_have_added, disk_res_sectors);
-}
-
-int bch2_trans_fs_usage_apply(struct btree_trans *trans,
-			      struct replicas_delta_list *deltas)
-{
-	struct bch_fs *c = trans->c;
-	struct replicas_delta *d, *d2;
-	struct replicas_delta *top = (void *) deltas->d + deltas->used;
-	struct bch_fs_usage *dst;
-	unsigned i;
-
-	percpu_down_read(&c->mark_lock);
-	preempt_disable();
-	dst = fs_usage_ptr(c, trans->journal_res.seq, false);
-
-	for (d = deltas->d; d != top; d = replicas_delta_next(d))
-		if (__update_replicas(c, dst, &d->r, d->delta))
-			goto need_mark;
-
-	dst->b.nr_inodes += deltas->nr_inodes;
-
-	for (i = 0; i < BCH_REPLICAS_MAX; i++) {
-		dst->b.reserved			+= deltas->persistent_reserved[i];
-		dst->persistent_reserved[i]	+= deltas->persistent_reserved[i];
-	}
-
-	preempt_enable();
-	percpu_up_read(&c->mark_lock);
-	return 0;
-need_mark:
-	/* revert changes: */
-	for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2))
-		BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta));
-
-	preempt_enable();
-	percpu_up_read(&c->mark_lock);
-	return -1;
-}
-
-/* KEY_TYPE_extent: */
-
-static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca,
-			  struct bkey_s_c k,
-			  const struct bch_extent_ptr *ptr,
-			  s64 sectors, enum bch_data_type ptr_data_type,
-			  struct bch_alloc_v4 *a)
-{
-	u32 *dst_sectors = !ptr->cached
-		? &a->dirty_sectors
-		: &a->cached_sectors;
-	int ret = bch2_bucket_ref_update(trans, ca, k, ptr, sectors, ptr_data_type,
-					 a->gen, a->data_type, dst_sectors);
-
-	if (ret)
-		return ret;
-
-	alloc_data_type_set(a, ptr_data_type);
-	return 0;
-}
-
-static int bch2_trigger_pointer(struct btree_trans *trans,
-			enum btree_id btree_id, unsigned level,
-			struct bkey_s_c k, struct extent_ptr_decoded p,
-			const union bch_extent_entry *entry,
-			s64 *sectors,
-			enum btree_iter_update_trigger_flags flags)
-{
-	bool insert = !(flags & BTREE_TRIGGER_overwrite);
-	int ret = 0;
-
-	struct bch_fs *c = trans->c;
-	struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
-	if (unlikely(!ca)) {
-		if (insert)
-			ret = -EIO;
-		goto err;
-	}
-
-	struct bpos bucket;
-	struct bch_backpointer bp;
-	bch2_extent_ptr_to_bp(trans->c, ca, btree_id, level, k, p, entry, &bucket, &bp);
-	*sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len);
-
-	if (flags & BTREE_TRIGGER_transactional) {
-		struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket);
-		ret = PTR_ERR_OR_ZERO(a) ?:
-			__mark_pointer(trans, ca, k, &p.ptr, *sectors, bp.data_type, &a->v);
-		if (ret)
-			goto err;
-
-		if (!p.ptr.cached) {
-			ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, k, insert);
-			if (ret)
-				goto err;
-		}
-	}
-
-	if (flags & BTREE_TRIGGER_gc) {
-		percpu_down_read(&c->mark_lock);
-		struct bucket *g = gc_bucket(ca, bucket.offset);
-		bucket_lock(g);
-		struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
-		ret = __mark_pointer(trans, ca, k, &p.ptr, *sectors, bp.data_type, &new);
-		if (!ret) {
-			alloc_to_bucket(g, new);
-			bch2_dev_usage_update(c, ca, &old, &new, 0, true);
-		}
-		bucket_unlock(g);
-		percpu_up_read(&c->mark_lock);
-	}
-err:
-	bch2_dev_put(ca);
-	return ret;
-}
-
-static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
-				struct bkey_s_c k,
-				struct extent_ptr_decoded p,
-				enum bch_data_type data_type,
-				s64 sectors,
-				enum btree_iter_update_trigger_flags flags)
-{
-	if (flags & BTREE_TRIGGER_transactional) {
-		struct btree_iter iter;
-		struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter,
-				BTREE_ID_stripes, POS(0, p.ec.idx),
-				BTREE_ITER_with_updates, stripe);
-		int ret = PTR_ERR_OR_ZERO(s);
-		if (unlikely(ret)) {
-			bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans,
-				"pointer to nonexistent stripe %llu",
-				(u64) p.ec.idx);
-			goto err;
-		}
-
-		if (!bch2_ptr_matches_stripe(&s->v, p)) {
-			bch2_trans_inconsistent(trans,
-				"stripe pointer doesn't match stripe %llu",
-				(u64) p.ec.idx);
-			ret = -EIO;
-			goto err;
-		}
-
-		stripe_blockcount_set(&s->v, p.ec.block,
-			stripe_blockcount_get(&s->v, p.ec.block) +
-			sectors);
-
-		struct bch_replicas_padded r;
-		bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
-		r.e.data_type = data_type;
-		ret = bch2_update_replicas_list(trans, &r.e, sectors);
-err:
-		bch2_trans_iter_exit(trans, &iter);
-		return ret;
-	}
-
-	if (flags & BTREE_TRIGGER_gc) {
-		struct bch_fs *c = trans->c;
-
-		BUG_ON(!(flags & BTREE_TRIGGER_gc));
-
-		struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL);
-		if (!m) {
-			bch_err(c, "error allocating memory for gc_stripes, idx %llu",
-				(u64) p.ec.idx);
-			return -BCH_ERR_ENOMEM_mark_stripe_ptr;
-		}
-
-		mutex_lock(&c->ec_stripes_heap_lock);
-
-		if (!m || !m->alive) {
-			mutex_unlock(&c->ec_stripes_heap_lock);
-			struct printbuf buf = PRINTBUF;
-			bch2_bkey_val_to_text(&buf, c, k);
-			bch_err_ratelimited(c, "pointer to nonexistent stripe %llu\n  while marking %s",
-					    (u64) p.ec.idx, buf.buf);
-			printbuf_exit(&buf);
-			bch2_inconsistent_error(c);
-			return -EIO;
-		}
-
-		m->block_sectors[p.ec.block] += sectors;
-
-		struct bch_replicas_padded r = m->r;
-		mutex_unlock(&c->ec_stripes_heap_lock);
-
-		r.e.data_type = data_type;
-		bch2_update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true);
-	}
-
-	return 0;
-}
-
-static int __trigger_extent(struct btree_trans *trans,
-			    enum btree_id btree_id, unsigned level,
-			    struct bkey_s_c k,
-			    enum btree_iter_update_trigger_flags flags)
-{
-	bool gc = flags & BTREE_TRIGGER_gc;
-	struct bch_fs *c = trans->c;
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	const union bch_extent_entry *entry;
-	struct extent_ptr_decoded p;
-	struct bch_replicas_padded r;
-	enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
-		? BCH_DATA_btree
-		: BCH_DATA_user;
-	s64 replicas_sectors = 0;
-	int ret = 0;
-
-	r.e.data_type	= data_type;
-	r.e.nr_devs	= 0;
-	r.e.nr_required	= 1;
-
-	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-		s64 disk_sectors;
-		ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags);
-		if (ret < 0)
-			return ret;
-
-		bool stale = ret > 0;
-
-		if (p.ptr.cached) {
-			if (!stale) {
-				ret = !gc
-					? bch2_update_cached_sectors_list(trans, p.ptr.dev, disk_sectors)
-					: update_cached_sectors(c, k, p.ptr.dev, disk_sectors, 0, true);
-				bch2_fs_fatal_err_on(ret && gc, c, "%s: no replicas entry while updating cached sectors",
-						     bch2_err_str(ret));
-				if (ret)
-					return ret;
-			}
-		} else if (!p.has_ec) {
-			replicas_sectors       += disk_sectors;
-			r.e.devs[r.e.nr_devs++]	= p.ptr.dev;
-		} else {
-			ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
-			if (ret)
-				return ret;
-
-			/*
-			 * There may be other dirty pointers in this extent, but
-			 * if so they're not required for mounting if we have an
-			 * erasure coded pointer in this extent:
-			 */
-			r.e.nr_required = 0;
-		}
-	}
-
-	if (r.e.nr_devs) {
-		ret = !gc
-			? bch2_update_replicas_list(trans, &r.e, replicas_sectors)
-			: bch2_update_replicas(c, k, &r.e, replicas_sectors, 0, true);
-		if (unlikely(ret && gc)) {
-			struct printbuf buf = PRINTBUF;
-
-			bch2_bkey_val_to_text(&buf, c, k);
-			bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf);
-			printbuf_exit(&buf);
-		}
-		if (ret)
-			return ret;
-	}
-
-	return 0;
-}
-
-int bch2_trigger_extent(struct btree_trans *trans,
-			enum btree_id btree, unsigned level,
-			struct bkey_s_c old, struct bkey_s new,
-			enum btree_iter_update_trigger_flags flags)
-{
-	struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c);
-	struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old);
-	unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start;
-	unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start;
-
-	if (unlikely(flags & BTREE_TRIGGER_check_repair))
-		return bch2_check_fix_ptrs(trans, btree, level, new.s_c, flags);
-
-	/* if pointers aren't changing - nothing to do: */
-	if (new_ptrs_bytes == old_ptrs_bytes &&
-	    !memcmp(new_ptrs.start,
-		    old_ptrs.start,
-		    new_ptrs_bytes))
-		return 0;
-
-	if (flags & BTREE_TRIGGER_transactional) {
-		struct bch_fs *c = trans->c;
-		int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) -
-			  (int) bch2_bkey_needs_rebalance(c, old);
-
-		if (mod) {
-			int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
-							      new.k->p, mod > 0);
-			if (ret)
-				return ret;
-		}
-	}
-
-	if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc))
-		return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree, level, old, new, flags);
-
-	return 0;
-}
-
-/* KEY_TYPE_reservation */
-
-static int __trigger_reservation(struct btree_trans *trans,
-			enum btree_id btree_id, unsigned level, struct bkey_s_c k,
-			enum btree_iter_update_trigger_flags flags)
-{
-	struct bch_fs *c = trans->c;
-	unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
-	s64 sectors = (s64) k.k->size * replicas;
-
-	if (flags & BTREE_TRIGGER_overwrite)
-		sectors = -sectors;
-
-	if (flags & BTREE_TRIGGER_transactional) {
-		int ret = bch2_replicas_deltas_realloc(trans, 0);
-		if (ret)
-			return ret;
-
-		struct replicas_delta_list *d = trans->fs_usage_deltas;
-		replicas = min(replicas, ARRAY_SIZE(d->persistent_reserved));
-
-		d->persistent_reserved[replicas - 1] += sectors;
-	}
-
-	if (flags & BTREE_TRIGGER_gc) {
-		percpu_down_read(&c->mark_lock);
-		preempt_disable();
-
-		struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage_gc);
-
-		replicas = min(replicas, ARRAY_SIZE(fs_usage->persistent_reserved));
-		fs_usage->b.reserved				+= sectors;
-		fs_usage->persistent_reserved[replicas - 1]	+= sectors;
-
-		preempt_enable();
-		percpu_up_read(&c->mark_lock);
-	}
-
-	return 0;
-}
-
-int bch2_trigger_reservation(struct btree_trans *trans,
-			  enum btree_id btree_id, unsigned level,
-			  struct bkey_s_c old, struct bkey_s new,
-			  enum btree_iter_update_trigger_flags flags)
-{
-	return trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags);
-}
-
-/* Mark superblocks: */
-
-static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
-				    struct bch_dev *ca, u64 b,
-				    enum bch_data_type type,
-				    unsigned sectors)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter iter;
-	int ret = 0;
-
-	struct bkey_i_alloc_v4 *a =
-		bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(ca->dev_idx, b));
-	if (IS_ERR(a))
-		return PTR_ERR(a);
-
-	if (a->v.data_type && type && a->v.data_type != type) {
-		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
-			      BCH_FSCK_ERR_bucket_metadata_type_mismatch,
-			"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
-			"while marking %s",
-			iter.pos.inode, iter.pos.offset, a->v.gen,
-			bch2_data_type_str(a->v.data_type),
-			bch2_data_type_str(type),
-			bch2_data_type_str(type));
-		ret = -EIO;
-		goto err;
-	}
-
-	if (a->v.data_type	!= type ||
-	    a->v.dirty_sectors	!= sectors) {
-		a->v.data_type		= type;
-		a->v.dirty_sectors	= sectors;
-		ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
-	}
-err:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-static int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
-			u64 b, enum bch_data_type data_type, unsigned sectors,
-			enum btree_iter_update_trigger_flags flags)
-{
-	int ret = 0;
-
-	percpu_down_read(&c->mark_lock);
-	struct bucket *g = gc_bucket(ca, b);
-
-	bucket_lock(g);
-	struct bch_alloc_v4 old = bucket_m_to_alloc(*g);
-
-	if (bch2_fs_inconsistent_on(g->data_type &&
-			g->data_type != data_type, c,
-			"different types of data in same bucket: %s, %s",
-			bch2_data_type_str(g->data_type),
-			bch2_data_type_str(data_type))) {
-		ret = -EIO;
-		goto err;
-	}
-
-	if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
-			"bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size",
-			ca->dev_idx, b, g->gen,
-			bch2_data_type_str(g->data_type ?: data_type),
-			g->dirty_sectors, sectors)) {
-		ret = -EIO;
-		goto err;
-	}
-
-	g->data_type = data_type;
-	g->dirty_sectors += sectors;
-	struct bch_alloc_v4 new = bucket_m_to_alloc(*g);
-err:
-	bucket_unlock(g);
-	if (!ret)
-		bch2_dev_usage_update(c, ca, &old, &new, 0, true);
-	percpu_up_read(&c->mark_lock);
-	return ret;
-}
-
-int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
-			struct bch_dev *ca, u64 b,
-			enum bch_data_type type, unsigned sectors,
-			enum btree_iter_update_trigger_flags flags)
-{
-	BUG_ON(type != BCH_DATA_free &&
-	       type != BCH_DATA_sb &&
-	       type != BCH_DATA_journal);
-
-	/*
-	 * Backup superblock might be past the end of our normal usable space:
-	 */
-	if (b >= ca->mi.nbuckets)
-		return 0;
-
-	if (flags & BTREE_TRIGGER_gc)
-		return bch2_mark_metadata_bucket(trans->c, ca, b, type, sectors, flags);
-	else if (flags & BTREE_TRIGGER_transactional)
-		return commit_do(trans, NULL, NULL, 0,
-				 __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
-	else
-		BUG();
-}
-
-static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
-			struct bch_dev *ca, u64 start, u64 end,
-			enum bch_data_type type, u64 *bucket, unsigned *bucket_sectors,
-			enum btree_iter_update_trigger_flags flags)
-{
-	do {
-		u64 b = sector_to_bucket(ca, start);
-		unsigned sectors =
-			min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
-
-		if (b != *bucket && *bucket_sectors) {
-			int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket,
-							type, *bucket_sectors, flags);
-			if (ret)
-				return ret;
-
-			*bucket_sectors = 0;
-		}
-
-		*bucket		= b;
-		*bucket_sectors	+= sectors;
-		start += sectors;
-	} while (start < end);
-
-	return 0;
-}
-
-static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *ca,
-			enum btree_iter_update_trigger_flags flags)
-{
-	struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
-	u64 bucket = 0;
-	unsigned i, bucket_sectors = 0;
-	int ret;
-
-	for (i = 0; i < layout->nr_superblocks; i++) {
-		u64 offset = le64_to_cpu(layout->sb_offset[i]);
-
-		if (offset == BCH_SB_SECTOR) {
-			ret = bch2_trans_mark_metadata_sectors(trans, ca,
-						0, BCH_SB_SECTOR,
-						BCH_DATA_sb, &bucket, &bucket_sectors, flags);
-			if (ret)
-				return ret;
-		}
-
-		ret = bch2_trans_mark_metadata_sectors(trans, ca, offset,
-				      offset + (1 << layout->sb_max_size_bits),
-				      BCH_DATA_sb, &bucket, &bucket_sectors, flags);
-		if (ret)
-			return ret;
-	}
-
-	if (bucket_sectors) {
-		ret = bch2_trans_mark_metadata_bucket(trans, ca,
-				bucket, BCH_DATA_sb, bucket_sectors, flags);
-		if (ret)
-			return ret;
-	}
-
-	for (i = 0; i < ca->journal.nr; i++) {
-		ret = bch2_trans_mark_metadata_bucket(trans, ca,
-				ca->journal.buckets[i],
-				BCH_DATA_journal, ca->mi.bucket_size, flags);
-		if (ret)
-			return ret;
-	}
-
-	return 0;
-}
-
-int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca,
-			enum btree_iter_update_trigger_flags flags)
-{
-	int ret = bch2_trans_run(c,
-		__bch2_trans_mark_dev_sb(trans, ca, flags));
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c,
-			enum btree_iter_update_trigger_flags flags)
-{
-	for_each_online_member(c, ca) {
-		int ret = bch2_trans_mark_dev_sb(c, ca, flags);
-		if (ret) {
-			bch2_dev_put(ca);
-			return ret;
-		}
-	}
-
-	return 0;
-}
-
-int bch2_trans_mark_dev_sbs(struct bch_fs *c)
-{
-	return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional);
-}
-
-/* Disk reservations: */
-
-#define SECTORS_CACHE	1024
-
-int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
-			      u64 sectors, int flags)
-{
-	struct bch_fs_pcpu *pcpu;
-	u64 old, v, get;
-	s64 sectors_available;
-	int ret;
-
-	percpu_down_read(&c->mark_lock);
-	preempt_disable();
-	pcpu = this_cpu_ptr(c->pcpu);
-
-	if (sectors <= pcpu->sectors_available)
-		goto out;
-
-	v = atomic64_read(&c->sectors_available);
-	do {
-		old = v;
-		get = min((u64) sectors + SECTORS_CACHE, old);
-
-		if (get < sectors) {
-			preempt_enable();
-			goto recalculate;
-		}
-	} while ((v = atomic64_cmpxchg(&c->sectors_available,
-				       old, old - get)) != old);
-
-	pcpu->sectors_available		+= get;
-
-out:
-	pcpu->sectors_available		-= sectors;
-	this_cpu_add(*c->online_reserved, sectors);
-	res->sectors			+= sectors;
-
-	preempt_enable();
-	percpu_up_read(&c->mark_lock);
-	return 0;
-
-recalculate:
-	mutex_lock(&c->sectors_available_lock);
-
-	percpu_u64_set(&c->pcpu->sectors_available, 0);
-	sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);
-
-	if (sectors <= sectors_available ||
-	    (flags & BCH_DISK_RESERVATION_NOFAIL)) {
-		atomic64_set(&c->sectors_available,
-			     max_t(s64, 0, sectors_available - sectors));
-		this_cpu_add(*c->online_reserved, sectors);
-		res->sectors			+= sectors;
-		ret = 0;
-	} else {
-		atomic64_set(&c->sectors_available, sectors_available);
-		ret = -BCH_ERR_ENOSPC_disk_reservation;
-	}
-
-	mutex_unlock(&c->sectors_available_lock);
-	percpu_up_read(&c->mark_lock);
-
-	return ret;
-}
-
-/* Startup/shutdown: */
-
-void bch2_buckets_nouse_free(struct bch_fs *c)
-{
-	for_each_member_device(c, ca) {
-		kvfree_rcu_mightsleep(ca->buckets_nouse);
-		ca->buckets_nouse = NULL;
-	}
-}
-
-int bch2_buckets_nouse_alloc(struct bch_fs *c)
-{
-	for_each_member_device(c, ca) {
-		BUG_ON(ca->buckets_nouse);
-
-		ca->buckets_nouse = kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) *
-					    sizeof(unsigned long),
-					    GFP_KERNEL|__GFP_ZERO);
-		if (!ca->buckets_nouse) {
-			bch2_dev_put(ca);
-			return -BCH_ERR_ENOMEM_buckets_nouse;
-		}
-	}
-
-	return 0;
-}
-
-static void bucket_gens_free_rcu(struct rcu_head *rcu)
-{
-	struct bucket_gens *buckets =
-		container_of(rcu, struct bucket_gens, rcu);
-
-	kvfree(buckets);
-}
-
-int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
-{
-	struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
-	bool resize = ca->bucket_gens != NULL;
-	int ret;
-
-	BUG_ON(resize && ca->buckets_nouse);
-
-	if (!(bucket_gens	= kvmalloc(sizeof(struct bucket_gens) + nbuckets,
-					   GFP_KERNEL|__GFP_ZERO))) {
-		ret = -BCH_ERR_ENOMEM_bucket_gens;
-		goto err;
-	}
-
-	bucket_gens->first_bucket = ca->mi.first_bucket;
-	bucket_gens->nbuckets	= nbuckets;
-
-	if (resize) {
-		down_write(&c->gc_lock);
-		down_write(&ca->bucket_lock);
-		percpu_down_write(&c->mark_lock);
-	}
-
-	old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);
-
-	if (resize) {
-		size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets);
-
-		memcpy(bucket_gens->b,
-		       old_bucket_gens->b,
-		       n);
-	}
-
-	rcu_assign_pointer(ca->bucket_gens, bucket_gens);
-	bucket_gens	= old_bucket_gens;
-
-	nbuckets = ca->mi.nbuckets;
-
-	if (resize) {
-		percpu_up_write(&c->mark_lock);
-		up_write(&ca->bucket_lock);
-		up_write(&c->gc_lock);
-	}
-
-	ret = 0;
-err:
-	if (bucket_gens)
-		call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
-
-	return ret;
-}
-
-void bch2_dev_buckets_free(struct bch_dev *ca)
-{
-	kvfree(ca->buckets_nouse);
-	kvfree(rcu_dereference_protected(ca->bucket_gens, 1));
-
-	for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++)
-		free_percpu(ca->usage[i]);
-	kfree(ca->usage_base);
-}
-
-int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
-{
-	ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
-	if (!ca->usage_base)
-		return -BCH_ERR_ENOMEM_usage_init;
-
-	for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) {
-		ca->usage[i] = alloc_percpu(struct bch_dev_usage);
-		if (!ca->usage[i])
-			return -BCH_ERR_ENOMEM_usage_init;
-	}
-
-	return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
-}
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
deleted file mode 100644
index 617ffde2fb7a..000000000000
--- a/fs/bcachefs/buckets.h
+++ /dev/null
@@ -1,475 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Code for manipulating bucket marks for garbage collection.
- *
- * Copyright 2014 Datera, Inc.
- */
-
-#ifndef _BUCKETS_H
-#define _BUCKETS_H
-
-#include "buckets_types.h"
-#include "extents.h"
-#include "sb-members.h"
-
-static inline u64 sector_to_bucket(const struct bch_dev *ca, sector_t s)
-{
-	return div_u64(s, ca->mi.bucket_size);
-}
-
-static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
-{
-	return ((sector_t) b) * ca->mi.bucket_size;
-}
-
-static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
-{
-	u32 remainder;
-
-	div_u64_rem(s, ca->mi.bucket_size, &remainder);
-	return remainder;
-}
-
-static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, u32 *offset)
-{
-	return div_u64_rem(s, ca->mi.bucket_size, offset);
-}
-
-#define for_each_bucket(_b, _buckets)				\
-	for (_b = (_buckets)->b + (_buckets)->first_bucket;	\
-	     _b < (_buckets)->b + (_buckets)->nbuckets; _b++)
-
-/*
- * Ugly hack alert:
- *
- * We need to cram a spinlock in a single byte, because that's what we have left
- * in struct bucket, and we care about the size of these - during fsck, we need
- * in memory state for every single bucket on every device.
- *
- * We used to do
- *   while (xchg(&b->lock, 1) cpu_relax();
- * but, it turns out not all architectures support xchg on a single byte.
- *
- * So now we use bit_spin_lock(), with fun games since we can't burn a whole
- * ulong for this - we just need to make sure the lock bit always ends up in the
- * first byte.
- */
-
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-#define BUCKET_LOCK_BITNR	0
-#else
-#define BUCKET_LOCK_BITNR	(BITS_PER_LONG - 1)
-#endif
-
-union ulong_byte_assert {
-	ulong	ulong;
-	u8	byte;
-};
-
-static inline void bucket_unlock(struct bucket *b)
-{
-	BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
-
-	clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock);
-	wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR);
-}
-
-static inline void bucket_lock(struct bucket *b)
-{
-	wait_on_bit_lock((void *) &b->lock, BUCKET_LOCK_BITNR,
-			 TASK_UNINTERRUPTIBLE);
-}
-
-static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca)
-{
-	return rcu_dereference_check(ca->buckets_gc,
-				     !ca->fs ||
-				     percpu_rwsem_is_held(&ca->fs->mark_lock) ||
-				     lockdep_is_held(&ca->fs->gc_lock) ||
-				     lockdep_is_held(&ca->bucket_lock));
-}
-
-static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
-{
-	struct bucket_array *buckets = gc_bucket_array(ca);
-
-	BUG_ON(!bucket_valid(ca, b));
-	return buckets->b + b;
-}
-
-static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
-{
-	return rcu_dereference_check(ca->bucket_gens,
-				     !ca->fs ||
-				     percpu_rwsem_is_held(&ca->fs->mark_lock) ||
-				     lockdep_is_held(&ca->fs->gc_lock) ||
-				     lockdep_is_held(&ca->bucket_lock));
-}
-
-static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
-{
-	struct bucket_gens *gens = bucket_gens(ca);
-
-	BUG_ON(!bucket_valid(ca, b));
-	return gens->b + b;
-}
-
-static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
-				   const struct bch_extent_ptr *ptr)
-{
-	return sector_to_bucket(ca, ptr->offset);
-}
-
-static inline struct bpos PTR_BUCKET_POS(const struct bch_dev *ca,
-					 const struct bch_extent_ptr *ptr)
-{
-	return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
-}
-
-static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_dev *ca,
-						const struct bch_extent_ptr *ptr,
-						u32 *bucket_offset)
-{
-	return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset));
-}
-
-static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca,
-					   const struct bch_extent_ptr *ptr)
-{
-	return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr));
-}
-
-static inline enum bch_data_type ptr_data_type(const struct bkey *k,
-					       const struct bch_extent_ptr *ptr)
-{
-	if (bkey_is_btree_ptr(k))
-		return BCH_DATA_btree;
-
-	return ptr->cached ? BCH_DATA_cached : BCH_DATA_user;
-}
-
-static inline s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
-{
-	EBUG_ON(sectors < 0);
-
-	return crc_is_compressed(p.crc)
-		? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size,
-				   p.crc.uncompressed_size)
-		: sectors;
-}
-
-static inline int gen_cmp(u8 a, u8 b)
-{
-	return (s8) (a - b);
-}
-
-static inline int gen_after(u8 a, u8 b)
-{
-	int r = gen_cmp(a, b);
-
-	return r > 0 ? r : 0;
-}
-
-static inline u8 dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
-{
-	return gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen);
-}
-
-/**
- * dev_ptr_stale() - check if a pointer points into a bucket that has been
- * invalidated.
- */
-static inline u8 dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
-{
-	rcu_read_lock();
-	u8 ret = dev_ptr_stale_rcu(ca, ptr);
-	rcu_read_unlock();
-
-	return ret;
-}
-
-/* Device usage: */
-
-void bch2_dev_usage_read_fast(struct bch_dev *, struct bch_dev_usage *);
-static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
-{
-	struct bch_dev_usage ret;
-
-	bch2_dev_usage_read_fast(ca, &ret);
-	return ret;
-}
-
-void bch2_dev_usage_init(struct bch_dev *);
-void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev_usage *);
-
-static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark)
-{
-	s64 reserved = 0;
-
-	switch (watermark) {
-	case BCH_WATERMARK_NR:
-		BUG();
-	case BCH_WATERMARK_stripe:
-		reserved += ca->mi.nbuckets >> 6;
-		fallthrough;
-	case BCH_WATERMARK_normal:
-		reserved += ca->mi.nbuckets >> 6;
-		fallthrough;
-	case BCH_WATERMARK_copygc:
-		reserved += ca->nr_btree_reserve;
-		fallthrough;
-	case BCH_WATERMARK_btree:
-		reserved += ca->nr_btree_reserve;
-		fallthrough;
-	case BCH_WATERMARK_btree_copygc:
-	case BCH_WATERMARK_reclaim:
-	case BCH_WATERMARK_interior_updates:
-		break;
-	}
-
-	return reserved;
-}
-
-static inline u64 dev_buckets_free(struct bch_dev *ca,
-				   struct bch_dev_usage usage,
-				   enum bch_watermark watermark)
-{
-	return max_t(s64, 0,
-		     usage.d[BCH_DATA_free].buckets -
-		     ca->nr_open_buckets -
-		     bch2_dev_buckets_reserved(ca, watermark));
-}
-
-static inline u64 __dev_buckets_available(struct bch_dev *ca,
-					  struct bch_dev_usage usage,
-					  enum bch_watermark watermark)
-{
-	return max_t(s64, 0,
-		       usage.d[BCH_DATA_free].buckets
-		     + usage.d[BCH_DATA_cached].buckets
-		     + usage.d[BCH_DATA_need_gc_gens].buckets
-		     + usage.d[BCH_DATA_need_discard].buckets
-		     - ca->nr_open_buckets
-		     - bch2_dev_buckets_reserved(ca, watermark));
-}
-
-static inline u64 dev_buckets_available(struct bch_dev *ca,
-					enum bch_watermark watermark)
-{
-	return __dev_buckets_available(ca, bch2_dev_usage_read(ca), watermark);
-}
-
-/* Filesystem usage: */
-
-static inline unsigned __fs_usage_u64s(unsigned nr_replicas)
-{
-	return sizeof(struct bch_fs_usage) / sizeof(u64) + nr_replicas;
-}
-
-static inline unsigned fs_usage_u64s(struct bch_fs *c)
-{
-	return __fs_usage_u64s(READ_ONCE(c->replicas.nr));
-}
-
-static inline unsigned __fs_usage_online_u64s(unsigned nr_replicas)
-{
-	return sizeof(struct bch_fs_usage_online) / sizeof(u64) + nr_replicas;
-}
-
-static inline unsigned fs_usage_online_u64s(struct bch_fs *c)
-{
-	return __fs_usage_online_u64s(READ_ONCE(c->replicas.nr));
-}
-
-static inline unsigned dev_usage_u64s(void)
-{
-	return sizeof(struct bch_dev_usage) / sizeof(u64);
-}
-
-u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *);
-
-struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *);
-
-void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned);
-
-void bch2_fs_usage_to_text(struct printbuf *,
-			   struct bch_fs *, struct bch_fs_usage_online *);
-
-u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *);
-
-struct bch_fs_usage_short
-bch2_fs_usage_read_short(struct bch_fs *);
-
-void bch2_dev_usage_update(struct bch_fs *, struct bch_dev *,
-			   const struct bch_alloc_v4 *,
-			   const struct bch_alloc_v4 *, u64, bool);
-
-/* key/bucket marking: */
-
-static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
-						unsigned journal_seq,
-						bool gc)
-{
-	percpu_rwsem_assert_held(&c->mark_lock);
-	BUG_ON(!gc && !journal_seq);
-
-	return this_cpu_ptr(gc
-			    ? c->usage_gc
-			    : c->usage[journal_seq & JOURNAL_BUF_MASK]);
-}
-
-int bch2_update_replicas(struct bch_fs *, struct bkey_s_c,
-			 struct bch_replicas_entry_v1 *, s64,
-			 unsigned, bool);
-int bch2_update_replicas_list(struct btree_trans *,
-			 struct bch_replicas_entry_v1 *, s64);
-int bch2_update_cached_sectors_list(struct btree_trans *, unsigned, s64);
-int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned);
-
-void bch2_fs_usage_initialize(struct bch_fs *);
-
-int bch2_bucket_ref_update(struct btree_trans *, struct bch_dev *,
-			   struct bkey_s_c, const struct bch_extent_ptr *,
-			   s64, enum bch_data_type, u8, u8, u32 *);
-
-int bch2_check_fix_ptrs(struct btree_trans *,
-			enum btree_id, unsigned, struct bkey_s_c,
-			enum btree_iter_update_trigger_flags);
-
-int bch2_trigger_extent(struct btree_trans *, enum btree_id, unsigned,
-			struct bkey_s_c, struct bkey_s,
-			enum btree_iter_update_trigger_flags);
-int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned,
-			  struct bkey_s_c, struct bkey_s,
-			  enum btree_iter_update_trigger_flags);
-
-#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\
-({												\
-	int ret = 0;										\
-												\
-	if (_old.k->type)									\
-		ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_insert);	\
-	if (!ret && _new.k->type)								\
-		ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_overwrite);\
-	ret;											\
-})
-
-void bch2_trans_account_disk_usage_change(struct btree_trans *);
-
-void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
-int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
-
-int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, u64,
-				    enum bch_data_type, unsigned,
-				    enum btree_iter_update_trigger_flags);
-int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *,
-				    enum btree_iter_update_trigger_flags);
-int bch2_trans_mark_dev_sbs_flags(struct bch_fs *,
-				    enum btree_iter_update_trigger_flags);
-int bch2_trans_mark_dev_sbs(struct bch_fs *);
-
-static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
-{
-	struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
-	u64 b_offset	= bucket_to_sector(ca, b);
-	u64 b_end	= bucket_to_sector(ca, b + 1);
-	unsigned i;
-
-	if (!b)
-		return true;
-
-	for (i = 0; i < layout->nr_superblocks; i++) {
-		u64 offset = le64_to_cpu(layout->sb_offset[i]);
-		u64 end = offset + (1 << layout->sb_max_size_bits);
-
-		if (!(offset >= b_end || end <= b_offset))
-			return true;
-	}
-
-	return false;
-}
-
-static inline const char *bch2_data_type_str(enum bch_data_type type)
-{
-	return type < BCH_DATA_NR
-		? __bch2_data_types[type]
-		: "(invalid data type)";
-}
-
-/* disk reservations: */
-
-static inline void bch2_disk_reservation_put(struct bch_fs *c,
-					     struct disk_reservation *res)
-{
-	if (res->sectors) {
-		this_cpu_sub(*c->online_reserved, res->sectors);
-		res->sectors = 0;
-	}
-}
-
-#define BCH_DISK_RESERVATION_NOFAIL		(1 << 0)
-
-int __bch2_disk_reservation_add(struct bch_fs *,
-				struct disk_reservation *,
-				u64, int);
-
-static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
-					    u64 sectors, int flags)
-{
-#ifdef __KERNEL__
-	u64 old, new;
-
-	do {
-		old = this_cpu_read(c->pcpu->sectors_available);
-		if (sectors > old)
-			return __bch2_disk_reservation_add(c, res, sectors, flags);
-
-		new = old - sectors;
-	} while (this_cpu_cmpxchg(c->pcpu->sectors_available, old, new) != old);
-
-	this_cpu_add(*c->online_reserved, sectors);
-	res->sectors			+= sectors;
-	return 0;
-#else
-	return __bch2_disk_reservation_add(c, res, sectors, flags);
-#endif
-}
-
-static inline struct disk_reservation
-bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
-{
-	return (struct disk_reservation) {
-		.sectors	= 0,
-#if 0
-		/* not used yet: */
-		.gen		= c->capacity_gen,
-#endif
-		.nr_replicas	= nr_replicas,
-	};
-}
-
-static inline int bch2_disk_reservation_get(struct bch_fs *c,
-					    struct disk_reservation *res,
-					    u64 sectors, unsigned nr_replicas,
-					    int flags)
-{
-	*res = bch2_disk_reservation_init(c, nr_replicas);
-
-	return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags);
-}
-
-#define RESERVE_FACTOR	6
-
-static inline u64 avail_factor(u64 r)
-{
-	return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1);
-}
-
-void bch2_buckets_nouse_free(struct bch_fs *);
-int bch2_buckets_nouse_alloc(struct bch_fs *);
-
-int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64);
-void bch2_dev_buckets_free(struct bch_dev *);
-int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *);
-
-#endif /* _BUCKETS_H */
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
deleted file mode 100644
index 6a31740222a7..000000000000
--- a/fs/bcachefs/buckets_types.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BUCKETS_TYPES_H
-#define _BUCKETS_TYPES_H
-
-#include "bcachefs_format.h"
-#include "util.h"
-
-#define BUCKET_JOURNAL_SEQ_BITS		16
-
-struct bucket {
-	u8			lock;
-	u8			gen_valid:1;
-	u8			data_type:7;
-	u8			gen;
-	u8			stripe_redundancy;
-	u32			stripe;
-	u32			dirty_sectors;
-	u32			cached_sectors;
-};
-
-struct bucket_array {
-	struct rcu_head		rcu;
-	u16			first_bucket;
-	size_t			nbuckets;
-	struct bucket		b[];
-};
-
-struct bucket_gens {
-	struct rcu_head		rcu;
-	u16			first_bucket;
-	size_t			nbuckets;
-	u8			b[];
-};
-
-struct bch_dev_usage {
-	struct {
-		u64		buckets;
-		u64		sectors; /* _compressed_ sectors: */
-		/*
-		 * XXX
-		 * Why do we have this? Isn't it just buckets * bucket_size -
-		 * sectors?
-		 */
-		u64		fragmented;
-	}			d[BCH_DATA_NR];
-};
-
-struct bch_fs_usage_base {
-	u64			hidden;
-	u64			btree;
-	u64			data;
-	u64			cached;
-	u64			reserved;
-	u64			nr_inodes;
-};
-
-struct bch_fs_usage {
-	/* all fields are in units of 512 byte sectors: */
-	struct bch_fs_usage_base b;
-	u64			persistent_reserved[BCH_REPLICAS_MAX];
-	u64			replicas[];
-};
-
-struct bch_fs_usage_online {
-	u64			online_reserved;
-	struct bch_fs_usage	u;
-};
-
-struct bch_fs_usage_short {
-	u64			capacity;
-	u64			used;
-	u64			free;
-	u64			nr_inodes;
-};
-
-/*
- * A reservation for space on disk:
- */
-struct disk_reservation {
-	u64			sectors;
-	u32			gen;
-	unsigned		nr_replicas;
-};
-
-#endif /* _BUCKETS_TYPES_H */
diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c
deleted file mode 100644
index ec1b636ef78d..000000000000
--- a/fs/bcachefs/buckets_waiting_for_journal.c
+++ /dev/null
@@ -1,166 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "buckets_waiting_for_journal.h"
-#include <linux/hash.h>
-#include <linux/random.h>
-
-static inline struct bucket_hashed *
-bucket_hash(struct buckets_waiting_for_journal_table *t,
-	    unsigned hash_seed_idx, u64 dev_bucket)
-{
-	return t->d + hash_64(dev_bucket ^ t->hash_seeds[hash_seed_idx], t->bits);
-}
-
-static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t bits)
-{
-	unsigned i;
-
-	t->bits = bits;
-	for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++)
-		get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i]));
-	memset(t->d, 0, sizeof(t->d[0]) << t->bits);
-}
-
-bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
-				      u64 flushed_seq,
-				      unsigned dev, u64 bucket)
-{
-	struct buckets_waiting_for_journal_table *t;
-	u64 dev_bucket = (u64) dev << 56 | bucket;
-	bool ret = false;
-	unsigned i;
-
-	mutex_lock(&b->lock);
-	t = b->t;
-
-	for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
-		struct bucket_hashed *h = bucket_hash(t, i, dev_bucket);
-
-		if (h->dev_bucket == dev_bucket) {
-			ret = h->journal_seq > flushed_seq;
-			break;
-		}
-	}
-
-	mutex_unlock(&b->lock);
-
-	return ret;
-}
-
-static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t,
-				struct bucket_hashed *new,
-				u64 flushed_seq)
-{
-	struct bucket_hashed *last_evicted = NULL;
-	unsigned tries, i;
-
-	for (tries = 0; tries < 10; tries++) {
-		struct bucket_hashed *old, *victim = NULL;
-
-		for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
-			old = bucket_hash(t, i, new->dev_bucket);
-
-			if (old->dev_bucket == new->dev_bucket ||
-			    old->journal_seq <= flushed_seq) {
-				*old = *new;
-				return true;
-			}
-
-			if (last_evicted != old)
-				victim = old;
-		}
-
-		/* hashed to same slot 3 times: */
-		if (!victim)
-			break;
-
-		/* Failed to find an empty slot: */
-		swap(*new, *victim);
-		last_evicted = victim;
-	}
-
-	return false;
-}
-
-int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
-					 u64 flushed_seq,
-					 unsigned dev, u64 bucket,
-					 u64 journal_seq)
-{
-	struct buckets_waiting_for_journal_table *t, *n;
-	struct bucket_hashed tmp, new = {
-		.dev_bucket	= (u64) dev << 56 | bucket,
-		.journal_seq	= journal_seq,
-	};
-	size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0;
-	int ret = 0;
-
-	mutex_lock(&b->lock);
-
-	if (likely(bucket_table_insert(b->t, &new, flushed_seq)))
-		goto out;
-
-	t = b->t;
-	size = 1UL << t->bits;
-	for (i = 0; i < size; i++)
-		nr_elements += t->d[i].journal_seq > flushed_seq;
-
-	new_bits = t->bits + (nr_elements * 3 > size);
-
-	n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL);
-	if (!n) {
-		ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set;
-		goto out;
-	}
-
-retry_rehash:
-	nr_rehashes++;
-	bucket_table_init(n, new_bits);
-
-	tmp = new;
-	BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq));
-
-	for (i = 0; i < 1UL << t->bits; i++) {
-		if (t->d[i].journal_seq <= flushed_seq)
-			continue;
-
-		tmp = t->d[i];
-		if (!bucket_table_insert(n, &tmp, flushed_seq))
-			goto retry_rehash;
-	}
-
-	b->t = n;
-	kvfree(t);
-
-	pr_debug("took %zu rehashes, table at %zu/%lu elements",
-		 nr_rehashes, nr_elements, 1UL << b->t->bits);
-out:
-	mutex_unlock(&b->lock);
-
-	return ret;
-}
-
-void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c)
-{
-	struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
-
-	kvfree(b->t);
-}
-
-#define INITIAL_TABLE_BITS		3
-
-int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c)
-{
-	struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
-
-	mutex_init(&b->lock);
-
-	b->t = kvmalloc(sizeof(*b->t) +
-			(sizeof(b->t->d[0]) << INITIAL_TABLE_BITS), GFP_KERNEL);
-	if (!b->t)
-		return -BCH_ERR_ENOMEM_buckets_waiting_for_journal_init;
-
-	bucket_table_init(b->t, INITIAL_TABLE_BITS);
-	return 0;
-}
diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h
deleted file mode 100644
index d2ae19cbe18c..000000000000
--- a/fs/bcachefs/buckets_waiting_for_journal.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H
-#define _BUCKETS_WAITING_FOR_JOURNAL_H
-
-#include "buckets_waiting_for_journal_types.h"
-
-bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
-				      u64, unsigned, u64);
-int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
-					 u64, unsigned, u64, u64);
-
-void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *);
-int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *);
-
-#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */
diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h
deleted file mode 100644
index e593db061d81..000000000000
--- a/fs/bcachefs/buckets_waiting_for_journal_types.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
-#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
-
-#include <linux/siphash.h>
-
-struct bucket_hashed {
-	u64			dev_bucket;
-	u64			journal_seq;
-};
-
-struct buckets_waiting_for_journal_table {
-	unsigned		bits;
-	u64			hash_seeds[3];
-	struct bucket_hashed	d[];
-};
-
-struct buckets_waiting_for_journal {
-	struct mutex		lock;
-	struct buckets_waiting_for_journal_table *t;
-};
-
-#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
deleted file mode 100644
index 9e54323f0f5f..000000000000
--- a/fs/bcachefs/chardev.c
+++ /dev/null
@@ -1,1022 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_CHARDEV
-
-#include "bcachefs.h"
-#include "bcachefs_ioctl.h"
-#include "buckets.h"
-#include "chardev.h"
-#include "journal.h"
-#include "move.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "super.h"
-#include "super-io.h"
-#include "thread_with_file.h"
-
-#include <linux/cdev.h>
-#include <linux/device.h>
-#include <linux/fs.h>
-#include <linux/ioctl.h>
-#include <linux/major.h>
-#include <linux/sched/task.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-
-/* returns with ref on ca->ref */
-static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
-					  unsigned flags)
-{
-	struct bch_dev *ca;
-
-	if (flags & BCH_BY_INDEX) {
-		if (dev >= c->sb.nr_devices)
-			return ERR_PTR(-EINVAL);
-
-		ca = bch2_dev_tryget_noerror(c, dev);
-		if (!ca)
-			return ERR_PTR(-EINVAL);
-	} else {
-		char *path;
-
-		path = strndup_user((const char __user *)
-				    (unsigned long) dev, PATH_MAX);
-		if (IS_ERR(path))
-			return ERR_CAST(path);
-
-		ca = bch2_dev_lookup(c, path);
-		kfree(path);
-	}
-
-	return ca;
-}
-
-#if 0
-static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
-{
-	struct bch_ioctl_assemble arg;
-	struct bch_fs *c;
-	u64 *user_devs = NULL;
-	char **devs = NULL;
-	unsigned i;
-	int ret = -EFAULT;
-
-	if (copy_from_user(&arg, user_arg, sizeof(arg)))
-		return -EFAULT;
-
-	if (arg.flags || arg.pad)
-		return -EINVAL;
-
-	user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL);
-	if (!user_devs)
-		return -ENOMEM;
-
-	devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
-
-	if (copy_from_user(user_devs, user_arg->devs,
-			   sizeof(u64) * arg.nr_devs))
-		goto err;
-
-	for (i = 0; i < arg.nr_devs; i++) {
-		devs[i] = strndup_user((const char __user *)(unsigned long)
-				       user_devs[i],
-				       PATH_MAX);
-		ret= PTR_ERR_OR_ZERO(devs[i]);
-		if (ret)
-			goto err;
-	}
-
-	c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty());
-	ret = PTR_ERR_OR_ZERO(c);
-	if (!ret)
-		closure_put(&c->cl);
-err:
-	if (devs)
-		for (i = 0; i < arg.nr_devs; i++)
-			kfree(devs[i]);
-	kfree(devs);
-	return ret;
-}
-
-static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg)
-{
-	struct bch_ioctl_incremental arg;
-	const char *err;
-	char *path;
-
-	if (copy_from_user(&arg, user_arg, sizeof(arg)))
-		return -EFAULT;
-
-	if (arg.flags || arg.pad)
-		return -EINVAL;
-
-	path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
-	ret = PTR_ERR_OR_ZERO(path);
-	if (ret)
-		return ret;
-
-	err = bch2_fs_open_incremental(path);
-	kfree(path);
-
-	if (err) {
-		pr_err("Could not register bcachefs devices: %s", err);
-		return -EINVAL;
-	}
-
-	return 0;
-}
-#endif
-
-struct fsck_thread {
-	struct thread_with_stdio thr;
-	struct bch_fs		*c;
-	struct bch_opts		opts;
-};
-
-static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
-{
-	struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
-	kfree(thr);
-}
-
-static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
-{
-	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
-	struct bch_fs *c = thr->c;
-
-	int ret = PTR_ERR_OR_ZERO(c);
-	if (ret)
-		return ret;
-
-	ret = bch2_fs_start(thr->c);
-	if (ret)
-		goto err;
-
-	if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
-		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
-		ret |= 1;
-	}
-	if (test_bit(BCH_FS_error, &c->flags)) {
-		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
-		ret |= 4;
-	}
-err:
-	bch2_fs_stop(c);
-	return ret;
-}
-
-static const struct thread_with_stdio_ops bch2_offline_fsck_ops = {
-	.exit		= bch2_fsck_thread_exit,
-	.fn		= bch2_fsck_offline_thread_fn,
-};
-
-static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
-{
-	struct bch_ioctl_fsck_offline arg;
-	struct fsck_thread *thr = NULL;
-	darray_str(devs) = {};
-	long ret = 0;
-
-	if (copy_from_user(&arg, user_arg, sizeof(arg)))
-		return -EFAULT;
-
-	if (arg.flags)
-		return -EINVAL;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	for (size_t i = 0; i < arg.nr_devs; i++) {
-		u64 dev_u64;
-		ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
-		if (ret)
-			goto err;
-
-		char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
-		ret = PTR_ERR_OR_ZERO(dev_str);
-		if (ret)
-			goto err;
-
-		ret = darray_push(&devs, dev_str);
-		if (ret) {
-			kfree(dev_str);
-			goto err;
-		}
-	}
-
-	thr = kzalloc(sizeof(*thr), GFP_KERNEL);
-	if (!thr) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
-	thr->opts = bch2_opts_empty();
-
-	if (arg.opts) {
-		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
-
-		ret =   PTR_ERR_OR_ZERO(optstr) ?:
-			bch2_parse_mount_opts(NULL, &thr->opts, optstr);
-		kfree(optstr);
-
-		if (ret)
-			goto err;
-	}
-
-	opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
-
-	/* We need request_key() to be called before we punt to kthread: */
-	opt_set(thr->opts, nostart, true);
-
-	bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);
-
-	thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts);
-
-	if (!IS_ERR(thr->c) &&
-	    thr->c->opts.errors == BCH_ON_ERROR_panic)
-		thr->c->opts.errors = BCH_ON_ERROR_ro;
-
-	ret = __bch2_run_thread_with_stdio(&thr->thr);
-out:
-	darray_for_each(devs, i)
-		kfree(*i);
-	darray_exit(&devs);
-	return ret;
-err:
-	if (thr)
-		bch2_fsck_thread_exit(&thr->thr);
-	pr_err("ret %s", bch2_err_str(ret));
-	goto out;
-}
-
-static long bch2_global_ioctl(unsigned cmd, void __user *arg)
-{
-	long ret;
-
-	switch (cmd) {
-#if 0
-	case BCH_IOCTL_ASSEMBLE:
-		return bch2_ioctl_assemble(arg);
-	case BCH_IOCTL_INCREMENTAL:
-		return bch2_ioctl_incremental(arg);
-#endif
-	case BCH_IOCTL_FSCK_OFFLINE: {
-		ret = bch2_ioctl_fsck_offline(arg);
-		break;
-	}
-	default:
-		ret = -ENOTTY;
-		break;
-	}
-
-	if (ret < 0)
-		ret = bch2_err_class(ret);
-	return ret;
-}
-
-static long bch2_ioctl_query_uuid(struct bch_fs *c,
-			struct bch_ioctl_query_uuid __user *user_arg)
-{
-	return copy_to_user_errcode(&user_arg->uuid, &c->sb.user_uuid,
-				    sizeof(c->sb.user_uuid));
-}
-
-#if 0
-static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg)
-{
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	if (arg.flags || arg.pad)
-		return -EINVAL;
-
-	return bch2_fs_start(c);
-}
-
-static long bch2_ioctl_stop(struct bch_fs *c)
-{
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	bch2_fs_stop(c);
-	return 0;
-}
-#endif
-
-static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
-{
-	char *path;
-	int ret;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	if (arg.flags || arg.pad)
-		return -EINVAL;
-
-	path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
-	ret = PTR_ERR_OR_ZERO(path);
-	if (ret)
-		return ret;
-
-	ret = bch2_dev_add(c, path);
-	kfree(path);
-
-	return ret;
-}
-
-static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg)
-{
-	struct bch_dev *ca;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
-			   BCH_FORCE_IF_METADATA_LOST|
-			   BCH_FORCE_IF_DEGRADED|
-			   BCH_BY_INDEX)) ||
-	    arg.pad)
-		return -EINVAL;
-
-	ca = bch2_device_lookup(c, arg.dev, arg.flags);
-	if (IS_ERR(ca))
-		return PTR_ERR(ca);
-
-	return bch2_dev_remove(c, ca, arg.flags);
-}
-
-static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg)
-{
-	char *path;
-	int ret;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	if (arg.flags || arg.pad)
-		return -EINVAL;
-
-	path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
-	ret = PTR_ERR_OR_ZERO(path);
-	if (ret)
-		return ret;
-
-	ret = bch2_dev_online(c, path);
-	kfree(path);
-	return ret;
-}
-
-static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg)
-{
-	struct bch_dev *ca;
-	int ret;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
-			   BCH_FORCE_IF_METADATA_LOST|
-			   BCH_FORCE_IF_DEGRADED|
-			   BCH_BY_INDEX)) ||
-	    arg.pad)
-		return -EINVAL;
-
-	ca = bch2_device_lookup(c, arg.dev, arg.flags);
-	if (IS_ERR(ca))
-		return PTR_ERR(ca);
-
-	ret = bch2_dev_offline(c, ca, arg.flags);
-	bch2_dev_put(ca);
-	return ret;
-}
-
-static long bch2_ioctl_disk_set_state(struct bch_fs *c,
-			struct bch_ioctl_disk_set_state arg)
-{
-	struct bch_dev *ca;
-	int ret;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
-			   BCH_FORCE_IF_METADATA_LOST|
-			   BCH_FORCE_IF_DEGRADED|
-			   BCH_BY_INDEX)) ||
-	    arg.pad[0] || arg.pad[1] || arg.pad[2] ||
-	    arg.new_state >= BCH_MEMBER_STATE_NR)
-		return -EINVAL;
-
-	ca = bch2_device_lookup(c, arg.dev, arg.flags);
-	if (IS_ERR(ca))
-		return PTR_ERR(ca);
-
-	ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags);
-	if (ret)
-		bch_err(c, "Error setting device state: %s", bch2_err_str(ret));
-
-	bch2_dev_put(ca);
-	return ret;
-}
-
-struct bch_data_ctx {
-	struct thread_with_file		thr;
-
-	struct bch_fs			*c;
-	struct bch_ioctl_data		arg;
-	struct bch_move_stats		stats;
-};
-
-static int bch2_data_thread(void *arg)
-{
-	struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr);
-
-	ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
-	ctx->stats.data_type = U8_MAX;
-	return 0;
-}
-
-static int bch2_data_job_release(struct inode *inode, struct file *file)
-{
-	struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
-
-	bch2_thread_with_file_exit(&ctx->thr);
-	kfree(ctx);
-	return 0;
-}
-
-static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
-				  size_t len, loff_t *ppos)
-{
-	struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
-	struct bch_fs *c = ctx->c;
-	struct bch_ioctl_data_event e = {
-		.type			= BCH_DATA_EVENT_PROGRESS,
-		.p.data_type		= ctx->stats.data_type,
-		.p.btree_id		= ctx->stats.pos.btree,
-		.p.pos			= ctx->stats.pos.pos,
-		.p.sectors_done		= atomic64_read(&ctx->stats.sectors_seen),
-		.p.sectors_total	= bch2_fs_usage_read_short(c).used,
-	};
-
-	if (len < sizeof(e))
-		return -EINVAL;
-
-	return copy_to_user_errcode(buf, &e, sizeof(e)) ?: sizeof(e);
-}
-
-static const struct file_operations bcachefs_data_ops = {
-	.release	= bch2_data_job_release,
-	.read		= bch2_data_job_read,
-	.llseek		= no_llseek,
-};
-
-static long bch2_ioctl_data(struct bch_fs *c,
-			    struct bch_ioctl_data arg)
-{
-	struct bch_data_ctx *ctx;
-	int ret;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	if (arg.op >= BCH_DATA_OP_NR || arg.flags)
-		return -EINVAL;
-
-	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
-	if (!ctx)
-		return -ENOMEM;
-
-	ctx->c = c;
-	ctx->arg = arg;
-
-	ret = bch2_run_thread_with_file(&ctx->thr,
-			&bcachefs_data_ops,
-			bch2_data_thread);
-	if (ret < 0)
-		kfree(ctx);
-	return ret;
-}
-
-static long bch2_ioctl_fs_usage(struct bch_fs *c,
-				struct bch_ioctl_fs_usage __user *user_arg)
-{
-	struct bch_ioctl_fs_usage *arg = NULL;
-	struct bch_replicas_usage *dst_e, *dst_end;
-	struct bch_fs_usage_online *src;
-	u32 replica_entries_bytes;
-	unsigned i;
-	int ret = 0;
-
-	if (!test_bit(BCH_FS_started, &c->flags))
-		return -EINVAL;
-
-	if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes))
-		return -EFAULT;
-
-	arg = kzalloc(size_add(sizeof(*arg), replica_entries_bytes), GFP_KERNEL);
-	if (!arg)
-		return -ENOMEM;
-
-	src = bch2_fs_usage_read(c);
-	if (!src) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
-	arg->capacity		= c->capacity;
-	arg->used		= bch2_fs_sectors_used(c, src);
-	arg->online_reserved	= src->online_reserved;
-
-	for (i = 0; i < BCH_REPLICAS_MAX; i++)
-		arg->persistent_reserved[i] = src->u.persistent_reserved[i];
-
-	dst_e	= arg->replicas;
-	dst_end = (void *) arg->replicas + replica_entries_bytes;
-
-	for (i = 0; i < c->replicas.nr; i++) {
-		struct bch_replicas_entry_v1 *src_e =
-			cpu_replicas_entry(&c->replicas, i);
-
-		/* check that we have enough space for one replicas entry */
-		if (dst_e + 1 > dst_end) {
-			ret = -ERANGE;
-			break;
-		}
-
-		dst_e->sectors		= src->u.replicas[i];
-		dst_e->r		= *src_e;
-
-		/* recheck after setting nr_devs: */
-		if (replicas_usage_next(dst_e) > dst_end) {
-			ret = -ERANGE;
-			break;
-		}
-
-		memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs);
-
-		dst_e = replicas_usage_next(dst_e);
-	}
-
-	arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas;
-
-	percpu_up_read(&c->mark_lock);
-	kfree(src);
-
-	if (ret)
-		goto err;
-
-	ret = copy_to_user_errcode(user_arg, arg,
-			sizeof(*arg) + arg->replica_entries_bytes);
-err:
-	kfree(arg);
-	return ret;
-}
-
-/* obsolete, didn't allow for new data types: */
-static long bch2_ioctl_dev_usage(struct bch_fs *c,
-				 struct bch_ioctl_dev_usage __user *user_arg)
-{
-	struct bch_ioctl_dev_usage arg;
-	struct bch_dev_usage src;
-	struct bch_dev *ca;
-	unsigned i;
-
-	if (!test_bit(BCH_FS_started, &c->flags))
-		return -EINVAL;
-
-	if (copy_from_user(&arg, user_arg, sizeof(arg)))
-		return -EFAULT;
-
-	if ((arg.flags & ~BCH_BY_INDEX) ||
-	    arg.pad[0] ||
-	    arg.pad[1] ||
-	    arg.pad[2])
-		return -EINVAL;
-
-	ca = bch2_device_lookup(c, arg.dev, arg.flags);
-	if (IS_ERR(ca))
-		return PTR_ERR(ca);
-
-	src = bch2_dev_usage_read(ca);
-
-	arg.state		= ca->mi.state;
-	arg.bucket_size		= ca->mi.bucket_size;
-	arg.nr_buckets		= ca->mi.nbuckets - ca->mi.first_bucket;
-
-	for (i = 0; i < BCH_DATA_NR; i++) {
-		arg.d[i].buckets	= src.d[i].buckets;
-		arg.d[i].sectors	= src.d[i].sectors;
-		arg.d[i].fragmented	= src.d[i].fragmented;
-	}
-
-	bch2_dev_put(ca);
-
-	return copy_to_user_errcode(user_arg, &arg, sizeof(arg));
-}
-
-static long bch2_ioctl_dev_usage_v2(struct bch_fs *c,
-				 struct bch_ioctl_dev_usage_v2 __user *user_arg)
-{
-	struct bch_ioctl_dev_usage_v2 arg;
-	struct bch_dev_usage src;
-	struct bch_dev *ca;
-	int ret = 0;
-
-	if (!test_bit(BCH_FS_started, &c->flags))
-		return -EINVAL;
-
-	if (copy_from_user(&arg, user_arg, sizeof(arg)))
-		return -EFAULT;
-
-	if ((arg.flags & ~BCH_BY_INDEX) ||
-	    arg.pad[0] ||
-	    arg.pad[1] ||
-	    arg.pad[2])
-		return -EINVAL;
-
-	ca = bch2_device_lookup(c, arg.dev, arg.flags);
-	if (IS_ERR(ca))
-		return PTR_ERR(ca);
-
-	src = bch2_dev_usage_read(ca);
-
-	arg.state		= ca->mi.state;
-	arg.bucket_size		= ca->mi.bucket_size;
-	arg.nr_data_types	= min(arg.nr_data_types, BCH_DATA_NR);
-	arg.nr_buckets		= ca->mi.nbuckets - ca->mi.first_bucket;
-
-	ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
-	if (ret)
-		goto err;
-
-	for (unsigned i = 0; i < arg.nr_data_types; i++) {
-		struct bch_ioctl_dev_usage_type t = {
-			.buckets	= src.d[i].buckets,
-			.sectors	= src.d[i].sectors,
-			.fragmented	= src.d[i].fragmented,
-		};
-
-		ret = copy_to_user_errcode(&user_arg->d[i], &t, sizeof(t));
-		if (ret)
-			goto err;
-	}
-err:
-	bch2_dev_put(ca);
-	return ret;
-}
-
-static long bch2_ioctl_read_super(struct bch_fs *c,
-				  struct bch_ioctl_read_super arg)
-{
-	struct bch_dev *ca = NULL;
-	struct bch_sb *sb;
-	int ret = 0;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) ||
-	    arg.pad)
-		return -EINVAL;
-
-	mutex_lock(&c->sb_lock);
-
-	if (arg.flags & BCH_READ_DEV) {
-		ca = bch2_device_lookup(c, arg.dev, arg.flags);
-		ret = PTR_ERR_OR_ZERO(ca);
-		if (ret)
-			goto err_unlock;
-
-		sb = ca->disk_sb.sb;
-	} else {
-		sb = c->disk_sb.sb;
-	}
-
-	if (vstruct_bytes(sb) > arg.size) {
-		ret = -ERANGE;
-		goto err;
-	}
-
-	ret = copy_to_user_errcode((void __user *)(unsigned long)arg.sb, sb,
-				   vstruct_bytes(sb));
-err:
-	bch2_dev_put(ca);
-err_unlock:
-	mutex_unlock(&c->sb_lock);
-	return ret;
-}
-
-static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
-				    struct bch_ioctl_disk_get_idx arg)
-{
-	dev_t dev = huge_decode_dev(arg.dev);
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	if (!dev)
-		return -EINVAL;
-
-	for_each_online_member(c, ca)
-		if (ca->dev == dev) {
-			percpu_ref_put(&ca->io_ref);
-			return ca->dev_idx;
-		}
-
-	return -BCH_ERR_ENOENT_dev_idx_not_found;
-}
-
-static long bch2_ioctl_disk_resize(struct bch_fs *c,
-				   struct bch_ioctl_disk_resize arg)
-{
-	struct bch_dev *ca;
-	int ret;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	if ((arg.flags & ~BCH_BY_INDEX) ||
-	    arg.pad)
-		return -EINVAL;
-
-	ca = bch2_device_lookup(c, arg.dev, arg.flags);
-	if (IS_ERR(ca))
-		return PTR_ERR(ca);
-
-	ret = bch2_dev_resize(c, ca, arg.nbuckets);
-
-	bch2_dev_put(ca);
-	return ret;
-}
-
-static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
-				   struct bch_ioctl_disk_resize_journal arg)
-{
-	struct bch_dev *ca;
-	int ret;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	if ((arg.flags & ~BCH_BY_INDEX) ||
-	    arg.pad)
-		return -EINVAL;
-
-	if (arg.nbuckets > U32_MAX)
-		return -EINVAL;
-
-	ca = bch2_device_lookup(c, arg.dev, arg.flags);
-	if (IS_ERR(ca))
-		return PTR_ERR(ca);
-
-	ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets);
-
-	bch2_dev_put(ca);
-	return ret;
-}
-
-static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
-{
-	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
-	struct bch_fs *c = thr->c;
-
-	c->stdio_filter = current;
-	c->stdio = &thr->thr.stdio;
-
-	/*
-	 * XXX: can we figure out a way to do this without mucking with c->opts?
-	 */
-	unsigned old_fix_errors = c->opts.fix_errors;
-	if (opt_defined(thr->opts, fix_errors))
-		c->opts.fix_errors = thr->opts.fix_errors;
-	else
-		c->opts.fix_errors = FSCK_FIX_ask;
-
-	c->opts.fsck = true;
-	set_bit(BCH_FS_fsck_running, &c->flags);
-
-	c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
-	int ret = bch2_run_online_recovery_passes(c);
-
-	clear_bit(BCH_FS_fsck_running, &c->flags);
-	bch_err_fn(c, ret);
-
-	c->stdio = NULL;
-	c->stdio_filter = NULL;
-	c->opts.fix_errors = old_fix_errors;
-
-	up(&c->online_fsck_mutex);
-	bch2_ro_ref_put(c);
-	return ret;
-}
-
-static const struct thread_with_stdio_ops bch2_online_fsck_ops = {
-	.exit		= bch2_fsck_thread_exit,
-	.fn		= bch2_fsck_online_thread_fn,
-};
-
-static long bch2_ioctl_fsck_online(struct bch_fs *c,
-				   struct bch_ioctl_fsck_online arg)
-{
-	struct fsck_thread *thr = NULL;
-	long ret = 0;
-
-	if (arg.flags)
-		return -EINVAL;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	if (!bch2_ro_ref_tryget(c))
-		return -EROFS;
-
-	if (down_trylock(&c->online_fsck_mutex)) {
-		bch2_ro_ref_put(c);
-		return -EAGAIN;
-	}
-
-	thr = kzalloc(sizeof(*thr), GFP_KERNEL);
-	if (!thr) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
-	thr->c = c;
-	thr->opts = bch2_opts_empty();
-
-	if (arg.opts) {
-		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
-
-		ret =   PTR_ERR_OR_ZERO(optstr) ?:
-			bch2_parse_mount_opts(c, &thr->opts, optstr);
-		kfree(optstr);
-
-		if (ret)
-			goto err;
-	}
-
-	ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
-err:
-	if (ret < 0) {
-		bch_err_fn(c, ret);
-		if (thr)
-			bch2_fsck_thread_exit(&thr->thr);
-		up(&c->online_fsck_mutex);
-		bch2_ro_ref_put(c);
-	}
-	return ret;
-}
-
-#define BCH_IOCTL(_name, _argtype)					\
-do {									\
-	_argtype i;							\
-									\
-	if (copy_from_user(&i, arg, sizeof(i)))				\
-		return -EFAULT;						\
-	ret = bch2_ioctl_##_name(c, i);					\
-	goto out;							\
-} while (0)
-
-long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
-{
-	long ret;
-
-	switch (cmd) {
-	case BCH_IOCTL_QUERY_UUID:
-		return bch2_ioctl_query_uuid(c, arg);
-	case BCH_IOCTL_FS_USAGE:
-		return bch2_ioctl_fs_usage(c, arg);
-	case BCH_IOCTL_DEV_USAGE:
-		return bch2_ioctl_dev_usage(c, arg);
-	case BCH_IOCTL_DEV_USAGE_V2:
-		return bch2_ioctl_dev_usage_v2(c, arg);
-#if 0
-	case BCH_IOCTL_START:
-		BCH_IOCTL(start, struct bch_ioctl_start);
-	case BCH_IOCTL_STOP:
-		return bch2_ioctl_stop(c);
-#endif
-	case BCH_IOCTL_READ_SUPER:
-		BCH_IOCTL(read_super, struct bch_ioctl_read_super);
-	case BCH_IOCTL_DISK_GET_IDX:
-		BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
-	}
-
-	if (!test_bit(BCH_FS_started, &c->flags))
-		return -EINVAL;
-
-	switch (cmd) {
-	case BCH_IOCTL_DISK_ADD:
-		BCH_IOCTL(disk_add, struct bch_ioctl_disk);
-	case BCH_IOCTL_DISK_REMOVE:
-		BCH_IOCTL(disk_remove, struct bch_ioctl_disk);
-	case BCH_IOCTL_DISK_ONLINE:
-		BCH_IOCTL(disk_online, struct bch_ioctl_disk);
-	case BCH_IOCTL_DISK_OFFLINE:
-		BCH_IOCTL(disk_offline, struct bch_ioctl_disk);
-	case BCH_IOCTL_DISK_SET_STATE:
-		BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state);
-	case BCH_IOCTL_DATA:
-		BCH_IOCTL(data, struct bch_ioctl_data);
-	case BCH_IOCTL_DISK_RESIZE:
-		BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
-	case BCH_IOCTL_DISK_RESIZE_JOURNAL:
-		BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal);
-	case BCH_IOCTL_FSCK_ONLINE:
-		BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online);
-	default:
-		return -ENOTTY;
-	}
-out:
-	if (ret < 0)
-		ret = bch2_err_class(ret);
-	return ret;
-}
-
-static DEFINE_IDR(bch_chardev_minor);
-
-static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v)
-{
-	unsigned minor = iminor(file_inode(filp));
-	struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL;
-	void __user *arg = (void __user *) v;
-
-	return c
-		? bch2_fs_ioctl(c, cmd, arg)
-		: bch2_global_ioctl(cmd, arg);
-}
-
-static const struct file_operations bch_chardev_fops = {
-	.owner		= THIS_MODULE,
-	.unlocked_ioctl = bch2_chardev_ioctl,
-	.open		= nonseekable_open,
-};
-
-static int bch_chardev_major;
-static const struct class bch_chardev_class = {
-	.name = "bcachefs",
-};
-static struct device *bch_chardev;
-
-void bch2_fs_chardev_exit(struct bch_fs *c)
-{
-	if (!IS_ERR_OR_NULL(c->chardev))
-		device_unregister(c->chardev);
-	if (c->minor >= 0)
-		idr_remove(&bch_chardev_minor, c->minor);
-}
-
-int bch2_fs_chardev_init(struct bch_fs *c)
-{
-	c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL);
-	if (c->minor < 0)
-		return c->minor;
-
-	c->chardev = device_create(&bch_chardev_class, NULL,
-				   MKDEV(bch_chardev_major, c->minor), c,
-				   "bcachefs%u-ctl", c->minor);
-	if (IS_ERR(c->chardev))
-		return PTR_ERR(c->chardev);
-
-	return 0;
-}
-
-void bch2_chardev_exit(void)
-{
-	device_destroy(&bch_chardev_class, MKDEV(bch_chardev_major, U8_MAX));
-	class_unregister(&bch_chardev_class);
-	if (bch_chardev_major > 0)
-		unregister_chrdev(bch_chardev_major, "bcachefs");
-}
-
-int __init bch2_chardev_init(void)
-{
-	int ret;
-
-	bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops);
-	if (bch_chardev_major < 0)
-		return bch_chardev_major;
-
-	ret = class_register(&bch_chardev_class);
-	if (ret)
-		goto major_out;
-
-	bch_chardev = device_create(&bch_chardev_class, NULL,
-				    MKDEV(bch_chardev_major, U8_MAX),
-				    NULL, "bcachefs-ctl");
-	if (IS_ERR(bch_chardev)) {
-		ret = PTR_ERR(bch_chardev);
-		goto class_out;
-	}
-
-	return 0;
-
-class_out:
-	class_unregister(&bch_chardev_class);
-major_out:
-	unregister_chrdev(bch_chardev_major, "bcachefs-ctl");
-	return ret;
-}
-
-#endif /* NO_BCACHEFS_CHARDEV */
diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h
deleted file mode 100644
index 0f563ca53c36..000000000000
--- a/fs/bcachefs/chardev.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_CHARDEV_H
-#define _BCACHEFS_CHARDEV_H
-
-#ifndef NO_BCACHEFS_FS
-
-long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *);
-
-void bch2_fs_chardev_exit(struct bch_fs *);
-int bch2_fs_chardev_init(struct bch_fs *);
-
-void bch2_chardev_exit(void);
-int __init bch2_chardev_init(void);
-
-#else
-
-static inline long bch2_fs_ioctl(struct bch_fs *c,
-				unsigned cmd, void __user * arg)
-{
-	return -ENOTTY;
-}
-
-static inline void bch2_fs_chardev_exit(struct bch_fs *c) {}
-static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; }
-
-static inline void bch2_chardev_exit(void) {}
-static inline int __init bch2_chardev_init(void) { return 0; }
-
-#endif /* NO_BCACHEFS_FS */
-
-#endif /* _BCACHEFS_CHARDEV_H */
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
deleted file mode 100644
index 85198f391e9c..000000000000
--- a/fs/bcachefs/checksum.c
+++ /dev/null
@@ -1,805 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "checksum.h"
-#include "errcode.h"
-#include "super.h"
-#include "super-io.h"
-
-#include <linux/crc32c.h>
-#include <linux/crypto.h>
-#include <linux/xxhash.h>
-#include <linux/key.h>
-#include <linux/random.h>
-#include <linux/scatterlist.h>
-#include <crypto/algapi.h>
-#include <crypto/chacha.h>
-#include <crypto/hash.h>
-#include <crypto/poly1305.h>
-#include <crypto/skcipher.h>
-#include <keys/user-type.h>
-
-/*
- * bch2_checksum state is an abstraction of the checksum state calculated over different pages.
- * it features page merging without having the checksum algorithm lose its state.
- * for native checksum aglorithms (like crc), a default seed value will do.
- * for hash-like algorithms, a state needs to be stored
- */
-
-struct bch2_checksum_state {
-	union {
-		u64 seed;
-		struct xxh64_state h64state;
-	};
-	unsigned int type;
-};
-
-static void bch2_checksum_init(struct bch2_checksum_state *state)
-{
-	switch (state->type) {
-	case BCH_CSUM_none:
-	case BCH_CSUM_crc32c:
-	case BCH_CSUM_crc64:
-		state->seed = 0;
-		break;
-	case BCH_CSUM_crc32c_nonzero:
-		state->seed = U32_MAX;
-		break;
-	case BCH_CSUM_crc64_nonzero:
-		state->seed = U64_MAX;
-		break;
-	case BCH_CSUM_xxhash:
-		xxh64_reset(&state->h64state, 0);
-		break;
-	default:
-		BUG();
-	}
-}
-
-static u64 bch2_checksum_final(const struct bch2_checksum_state *state)
-{
-	switch (state->type) {
-	case BCH_CSUM_none:
-	case BCH_CSUM_crc32c:
-	case BCH_CSUM_crc64:
-		return state->seed;
-	case BCH_CSUM_crc32c_nonzero:
-		return state->seed ^ U32_MAX;
-	case BCH_CSUM_crc64_nonzero:
-		return state->seed ^ U64_MAX;
-	case BCH_CSUM_xxhash:
-		return xxh64_digest(&state->h64state);
-	default:
-		BUG();
-	}
-}
-
-static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len)
-{
-	switch (state->type) {
-	case BCH_CSUM_none:
-		return;
-	case BCH_CSUM_crc32c_nonzero:
-	case BCH_CSUM_crc32c:
-		state->seed = crc32c(state->seed, data, len);
-		break;
-	case BCH_CSUM_crc64_nonzero:
-	case BCH_CSUM_crc64:
-		state->seed = crc64_be(state->seed, data, len);
-		break;
-	case BCH_CSUM_xxhash:
-		xxh64_update(&state->h64state, data, len);
-		break;
-	default:
-		BUG();
-	}
-}
-
-static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm,
-				struct nonce nonce,
-				struct scatterlist *sg, size_t len)
-{
-	SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
-	int ret;
-
-	skcipher_request_set_sync_tfm(req, tfm);
-	skcipher_request_set_callback(req, 0, NULL, NULL);
-	skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
-
-	ret = crypto_skcipher_encrypt(req);
-	if (ret)
-		pr_err("got error %i from crypto_skcipher_encrypt()", ret);
-
-	return ret;
-}
-
-static inline int do_encrypt(struct crypto_sync_skcipher *tfm,
-			      struct nonce nonce,
-			      void *buf, size_t len)
-{
-	if (!is_vmalloc_addr(buf)) {
-		struct scatterlist sg;
-
-		sg_init_table(&sg, 1);
-		sg_set_page(&sg,
-			    is_vmalloc_addr(buf)
-			    ? vmalloc_to_page(buf)
-			    : virt_to_page(buf),
-			    len, offset_in_page(buf));
-		return do_encrypt_sg(tfm, nonce, &sg, len);
-	} else {
-		unsigned pages = buf_pages(buf, len);
-		struct scatterlist *sg;
-		size_t orig_len = len;
-		int ret, i;
-
-		sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL);
-		if (!sg)
-			return -BCH_ERR_ENOMEM_do_encrypt;
-
-		sg_init_table(sg, pages);
-
-		for (i = 0; i < pages; i++) {
-			unsigned offset = offset_in_page(buf);
-			unsigned pg_len = min_t(size_t, len, PAGE_SIZE - offset);
-
-			sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset);
-			buf += pg_len;
-			len -= pg_len;
-		}
-
-		ret = do_encrypt_sg(tfm, nonce, sg, orig_len);
-		kfree(sg);
-		return ret;
-	}
-}
-
-int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
-			    void *buf, size_t len)
-{
-	struct crypto_sync_skcipher *chacha20 =
-		crypto_alloc_sync_skcipher("chacha20", 0, 0);
-	int ret;
-
-	ret = PTR_ERR_OR_ZERO(chacha20);
-	if (ret) {
-		pr_err("error requesting chacha20 cipher: %s", bch2_err_str(ret));
-		return ret;
-	}
-
-	ret = crypto_skcipher_setkey(&chacha20->base,
-				     (void *) key, sizeof(*key));
-	if (ret) {
-		pr_err("error from crypto_skcipher_setkey(): %s", bch2_err_str(ret));
-		goto err;
-	}
-
-	ret = do_encrypt(chacha20, nonce, buf, len);
-err:
-	crypto_free_sync_skcipher(chacha20);
-	return ret;
-}
-
-static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
-			struct nonce nonce)
-{
-	u8 key[POLY1305_KEY_SIZE];
-	int ret;
-
-	nonce.d[3] ^= BCH_NONCE_POLY;
-
-	memset(key, 0, sizeof(key));
-	ret = do_encrypt(c->chacha20, nonce, key, sizeof(key));
-	if (ret)
-		return ret;
-
-	desc->tfm = c->poly1305;
-	crypto_shash_init(desc);
-	crypto_shash_update(desc, key, sizeof(key));
-	return 0;
-}
-
-struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
-			      struct nonce nonce, const void *data, size_t len)
-{
-	switch (type) {
-	case BCH_CSUM_none:
-	case BCH_CSUM_crc32c_nonzero:
-	case BCH_CSUM_crc64_nonzero:
-	case BCH_CSUM_crc32c:
-	case BCH_CSUM_xxhash:
-	case BCH_CSUM_crc64: {
-		struct bch2_checksum_state state;
-
-		state.type = type;
-
-		bch2_checksum_init(&state);
-		bch2_checksum_update(&state, data, len);
-
-		return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) };
-	}
-
-	case BCH_CSUM_chacha20_poly1305_80:
-	case BCH_CSUM_chacha20_poly1305_128: {
-		SHASH_DESC_ON_STACK(desc, c->poly1305);
-		u8 digest[POLY1305_DIGEST_SIZE];
-		struct bch_csum ret = { 0 };
-
-		gen_poly_key(c, desc, nonce);
-
-		crypto_shash_update(desc, data, len);
-		crypto_shash_final(desc, digest);
-
-		memcpy(&ret, digest, bch_crc_bytes[type]);
-		return ret;
-	}
-	default:
-		BUG();
-	}
-}
-
-int bch2_encrypt(struct bch_fs *c, unsigned type,
-		  struct nonce nonce, void *data, size_t len)
-{
-	if (!bch2_csum_type_is_encryption(type))
-		return 0;
-
-	return do_encrypt(c->chacha20, nonce, data, len);
-}
-
-static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
-					   struct nonce nonce, struct bio *bio,
-					   struct bvec_iter *iter)
-{
-	struct bio_vec bv;
-
-	switch (type) {
-	case BCH_CSUM_none:
-		return (struct bch_csum) { 0 };
-	case BCH_CSUM_crc32c_nonzero:
-	case BCH_CSUM_crc64_nonzero:
-	case BCH_CSUM_crc32c:
-	case BCH_CSUM_xxhash:
-	case BCH_CSUM_crc64: {
-		struct bch2_checksum_state state;
-
-		state.type = type;
-		bch2_checksum_init(&state);
-
-#ifdef CONFIG_HIGHMEM
-		__bio_for_each_segment(bv, bio, *iter, *iter) {
-			void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
-
-			bch2_checksum_update(&state, p, bv.bv_len);
-			kunmap_local(p);
-		}
-#else
-		__bio_for_each_bvec(bv, bio, *iter, *iter)
-			bch2_checksum_update(&state, page_address(bv.bv_page) + bv.bv_offset,
-				bv.bv_len);
-#endif
-		return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) };
-	}
-
-	case BCH_CSUM_chacha20_poly1305_80:
-	case BCH_CSUM_chacha20_poly1305_128: {
-		SHASH_DESC_ON_STACK(desc, c->poly1305);
-		u8 digest[POLY1305_DIGEST_SIZE];
-		struct bch_csum ret = { 0 };
-
-		gen_poly_key(c, desc, nonce);
-
-#ifdef CONFIG_HIGHMEM
-		__bio_for_each_segment(bv, bio, *iter, *iter) {
-			void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
-
-			crypto_shash_update(desc, p, bv.bv_len);
-			kunmap_local(p);
-		}
-#else
-		__bio_for_each_bvec(bv, bio, *iter, *iter)
-			crypto_shash_update(desc,
-				page_address(bv.bv_page) + bv.bv_offset,
-				bv.bv_len);
-#endif
-		crypto_shash_final(desc, digest);
-
-		memcpy(&ret, digest, bch_crc_bytes[type]);
-		return ret;
-	}
-	default:
-		BUG();
-	}
-}
-
-struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
-				  struct nonce nonce, struct bio *bio)
-{
-	struct bvec_iter iter = bio->bi_iter;
-
-	return __bch2_checksum_bio(c, type, nonce, bio, &iter);
-}
-
-int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
-		     struct nonce nonce, struct bio *bio)
-{
-	struct bio_vec bv;
-	struct bvec_iter iter;
-	struct scatterlist sgl[16], *sg = sgl;
-	size_t bytes = 0;
-	int ret = 0;
-
-	if (!bch2_csum_type_is_encryption(type))
-		return 0;
-
-	sg_init_table(sgl, ARRAY_SIZE(sgl));
-
-	bio_for_each_segment(bv, bio, iter) {
-		if (sg == sgl + ARRAY_SIZE(sgl)) {
-			sg_mark_end(sg - 1);
-
-			ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
-			if (ret)
-				return ret;
-
-			nonce = nonce_add(nonce, bytes);
-			bytes = 0;
-
-			sg_init_table(sgl, ARRAY_SIZE(sgl));
-			sg = sgl;
-		}
-
-		sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset);
-		bytes += bv.bv_len;
-	}
-
-	sg_mark_end(sg - 1);
-	return do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
-}
-
-struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
-				    struct bch_csum b, size_t b_len)
-{
-	struct bch2_checksum_state state;
-
-	state.type = type;
-	bch2_checksum_init(&state);
-	state.seed = le64_to_cpu(a.lo);
-
-	BUG_ON(!bch2_checksum_mergeable(type));
-
-	while (b_len) {
-		unsigned page_len = min_t(unsigned, b_len, PAGE_SIZE);
-
-		bch2_checksum_update(&state,
-				page_address(ZERO_PAGE(0)), page_len);
-		b_len -= page_len;
-	}
-	a.lo = cpu_to_le64(bch2_checksum_final(&state));
-	a.lo ^= b.lo;
-	a.hi ^= b.hi;
-	return a;
-}
-
-int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
-			struct bversion version,
-			struct bch_extent_crc_unpacked crc_old,
-			struct bch_extent_crc_unpacked *crc_a,
-			struct bch_extent_crc_unpacked *crc_b,
-			unsigned len_a, unsigned len_b,
-			unsigned new_csum_type)
-{
-	struct bvec_iter iter = bio->bi_iter;
-	struct nonce nonce = extent_nonce(version, crc_old);
-	struct bch_csum merged = { 0 };
-	struct crc_split {
-		struct bch_extent_crc_unpacked	*crc;
-		unsigned			len;
-		unsigned			csum_type;
-		struct bch_csum			csum;
-	} splits[3] = {
-		{ crc_a, len_a, new_csum_type, { 0 }},
-		{ crc_b, len_b, new_csum_type, { 0 } },
-		{ NULL,	 bio_sectors(bio) - len_a - len_b, new_csum_type, { 0 } },
-	}, *i;
-	bool mergeable = crc_old.csum_type == new_csum_type &&
-		bch2_checksum_mergeable(new_csum_type);
-	unsigned crc_nonce = crc_old.nonce;
-
-	BUG_ON(len_a + len_b > bio_sectors(bio));
-	BUG_ON(crc_old.uncompressed_size != bio_sectors(bio));
-	BUG_ON(crc_is_compressed(crc_old));
-	BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) !=
-	       bch2_csum_type_is_encryption(new_csum_type));
-
-	for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
-		iter.bi_size = i->len << 9;
-		if (mergeable || i->crc)
-			i->csum = __bch2_checksum_bio(c, i->csum_type,
-						      nonce, bio, &iter);
-		else
-			bio_advance_iter(bio, &iter, i->len << 9);
-		nonce = nonce_add(nonce, i->len << 9);
-	}
-
-	if (mergeable)
-		for (i = splits; i < splits + ARRAY_SIZE(splits); i++)
-			merged = bch2_checksum_merge(new_csum_type, merged,
-						     i->csum, i->len << 9);
-	else
-		merged = bch2_checksum_bio(c, crc_old.csum_type,
-				extent_nonce(version, crc_old), bio);
-
-	if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) {
-		struct printbuf buf = PRINTBUF;
-		prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n"
-			   "expected %0llx:%0llx got %0llx:%0llx (old type ",
-			   __func__,
-			   crc_old.csum.hi,
-			   crc_old.csum.lo,
-			   merged.hi,
-			   merged.lo);
-		bch2_prt_csum_type(&buf, crc_old.csum_type);
-		prt_str(&buf, " new type ");
-		bch2_prt_csum_type(&buf, new_csum_type);
-		prt_str(&buf, ")");
-		bch_err(c, "%s", buf.buf);
-		printbuf_exit(&buf);
-		return -EIO;
-	}
-
-	for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
-		if (i->crc)
-			*i->crc = (struct bch_extent_crc_unpacked) {
-				.csum_type		= i->csum_type,
-				.compression_type	= crc_old.compression_type,
-				.compressed_size	= i->len,
-				.uncompressed_size	= i->len,
-				.offset			= 0,
-				.live_size		= i->len,
-				.nonce			= crc_nonce,
-				.csum			= i->csum,
-			};
-
-		if (bch2_csum_type_is_encryption(new_csum_type))
-			crc_nonce += i->len;
-	}
-
-	return 0;
-}
-
-/* BCH_SB_FIELD_crypt: */
-
-static int bch2_sb_crypt_validate(struct bch_sb *sb, struct bch_sb_field *f,
-				  enum bch_validate_flags flags, struct printbuf *err)
-{
-	struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
-
-	if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) {
-		prt_printf(err, "wrong size (got %zu should be %zu)",
-		       vstruct_bytes(&crypt->field), sizeof(*crypt));
-		return -BCH_ERR_invalid_sb_crypt;
-	}
-
-	if (BCH_CRYPT_KDF_TYPE(crypt)) {
-		prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
-		return -BCH_ERR_invalid_sb_crypt;
-	}
-
-	return 0;
-}
-
-static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb,
-				  struct bch_sb_field *f)
-{
-	struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
-
-	prt_printf(out, "KFD:               %llu\n", BCH_CRYPT_KDF_TYPE(crypt));
-	prt_printf(out, "scrypt n:          %llu\n", BCH_KDF_SCRYPT_N(crypt));
-	prt_printf(out, "scrypt r:          %llu\n", BCH_KDF_SCRYPT_R(crypt));
-	prt_printf(out, "scrypt p:          %llu\n", BCH_KDF_SCRYPT_P(crypt));
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
-	.validate	= bch2_sb_crypt_validate,
-	.to_text	= bch2_sb_crypt_to_text,
-};
-
-#ifdef __KERNEL__
-static int __bch2_request_key(char *key_description, struct bch_key *key)
-{
-	struct key *keyring_key;
-	const struct user_key_payload *ukp;
-	int ret;
-
-	keyring_key = request_key(&key_type_user, key_description, NULL);
-	if (IS_ERR(keyring_key))
-		return PTR_ERR(keyring_key);
-
-	down_read(&keyring_key->sem);
-	ukp = dereference_key_locked(keyring_key);
-	if (ukp->datalen == sizeof(*key)) {
-		memcpy(key, ukp->data, ukp->datalen);
-		ret = 0;
-	} else {
-		ret = -EINVAL;
-	}
-	up_read(&keyring_key->sem);
-	key_put(keyring_key);
-
-	return ret;
-}
-#else
-#include <keyutils.h>
-
-static int __bch2_request_key(char *key_description, struct bch_key *key)
-{
-	key_serial_t key_id;
-
-	key_id = request_key("user", key_description, NULL,
-			     KEY_SPEC_SESSION_KEYRING);
-	if (key_id >= 0)
-		goto got_key;
-
-	key_id = request_key("user", key_description, NULL,
-			     KEY_SPEC_USER_KEYRING);
-	if (key_id >= 0)
-		goto got_key;
-
-	key_id = request_key("user", key_description, NULL,
-			     KEY_SPEC_USER_SESSION_KEYRING);
-	if (key_id >= 0)
-		goto got_key;
-
-	return -errno;
-got_key:
-
-	if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key))
-		return -1;
-
-	return 0;
-}
-
-#include "crypto.h"
-#endif
-
-int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
-{
-	struct printbuf key_description = PRINTBUF;
-	int ret;
-
-	prt_printf(&key_description, "bcachefs:");
-	pr_uuid(&key_description, sb->user_uuid.b);
-
-	ret = __bch2_request_key(key_description.buf, key);
-	printbuf_exit(&key_description);
-
-#ifndef __KERNEL__
-	if (ret) {
-		char *passphrase = read_passphrase("Enter passphrase: ");
-		struct bch_encrypted_key sb_key;
-
-		bch2_passphrase_check(sb, passphrase,
-				      key, &sb_key);
-		ret = 0;
-	}
-#endif
-
-	/* stash with memfd, pass memfd fd to mount */
-
-	return ret;
-}
-
-#ifndef __KERNEL__
-int bch2_revoke_key(struct bch_sb *sb)
-{
-	key_serial_t key_id;
-	struct printbuf key_description = PRINTBUF;
-
-	prt_printf(&key_description, "bcachefs:");
-	pr_uuid(&key_description, sb->user_uuid.b);
-
-	key_id = request_key("user", key_description.buf, NULL, KEY_SPEC_USER_KEYRING);
-	printbuf_exit(&key_description);
-	if (key_id < 0)
-		return errno;
-
-	keyctl_revoke(key_id);
-
-	return 0;
-}
-#endif
-
-int bch2_decrypt_sb_key(struct bch_fs *c,
-			struct bch_sb_field_crypt *crypt,
-			struct bch_key *key)
-{
-	struct bch_encrypted_key sb_key = crypt->key;
-	struct bch_key user_key;
-	int ret = 0;
-
-	/* is key encrypted? */
-	if (!bch2_key_is_encrypted(&sb_key))
-		goto out;
-
-	ret = bch2_request_key(c->disk_sb.sb, &user_key);
-	if (ret) {
-		bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret));
-		goto err;
-	}
-
-	/* decrypt real key: */
-	ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
-				      &sb_key, sizeof(sb_key));
-	if (ret)
-		goto err;
-
-	if (bch2_key_is_encrypted(&sb_key)) {
-		bch_err(c, "incorrect encryption key");
-		ret = -EINVAL;
-		goto err;
-	}
-out:
-	*key = sb_key.key;
-err:
-	memzero_explicit(&sb_key, sizeof(sb_key));
-	memzero_explicit(&user_key, sizeof(user_key));
-	return ret;
-}
-
-static int bch2_alloc_ciphers(struct bch_fs *c)
-{
-	int ret;
-
-	if (!c->chacha20)
-		c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
-	ret = PTR_ERR_OR_ZERO(c->chacha20);
-
-	if (ret) {
-		bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret));
-		return ret;
-	}
-
-	if (!c->poly1305)
-		c->poly1305 = crypto_alloc_shash("poly1305", 0, 0);
-	ret = PTR_ERR_OR_ZERO(c->poly1305);
-
-	if (ret) {
-		bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret));
-		return ret;
-	}
-
-	return 0;
-}
-
-int bch2_disable_encryption(struct bch_fs *c)
-{
-	struct bch_sb_field_crypt *crypt;
-	struct bch_key key;
-	int ret = -EINVAL;
-
-	mutex_lock(&c->sb_lock);
-
-	crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
-	if (!crypt)
-		goto out;
-
-	/* is key encrypted? */
-	ret = 0;
-	if (bch2_key_is_encrypted(&crypt->key))
-		goto out;
-
-	ret = bch2_decrypt_sb_key(c, crypt, &key);
-	if (ret)
-		goto out;
-
-	crypt->key.magic	= cpu_to_le64(BCH_KEY_MAGIC);
-	crypt->key.key		= key;
-
-	SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0);
-	bch2_write_super(c);
-out:
-	mutex_unlock(&c->sb_lock);
-
-	return ret;
-}
-
-int bch2_enable_encryption(struct bch_fs *c, bool keyed)
-{
-	struct bch_encrypted_key key;
-	struct bch_key user_key;
-	struct bch_sb_field_crypt *crypt;
-	int ret = -EINVAL;
-
-	mutex_lock(&c->sb_lock);
-
-	/* Do we already have an encryption key? */
-	if (bch2_sb_field_get(c->disk_sb.sb, crypt))
-		goto err;
-
-	ret = bch2_alloc_ciphers(c);
-	if (ret)
-		goto err;
-
-	key.magic = cpu_to_le64(BCH_KEY_MAGIC);
-	get_random_bytes(&key.key, sizeof(key.key));
-
-	if (keyed) {
-		ret = bch2_request_key(c->disk_sb.sb, &user_key);
-		if (ret) {
-			bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret));
-			goto err;
-		}
-
-		ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
-					      &key, sizeof(key));
-		if (ret)
-			goto err;
-	}
-
-	ret = crypto_skcipher_setkey(&c->chacha20->base,
-			(void *) &key.key, sizeof(key.key));
-	if (ret)
-		goto err;
-
-	crypt = bch2_sb_field_resize(&c->disk_sb, crypt,
-				     sizeof(*crypt) / sizeof(u64));
-	if (!crypt) {
-		ret = -BCH_ERR_ENOSPC_sb_crypt;
-		goto err;
-	}
-
-	crypt->key = key;
-
-	/* write superblock */
-	SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1);
-	bch2_write_super(c);
-err:
-	mutex_unlock(&c->sb_lock);
-	memzero_explicit(&user_key, sizeof(user_key));
-	memzero_explicit(&key, sizeof(key));
-	return ret;
-}
-
-void bch2_fs_encryption_exit(struct bch_fs *c)
-{
-	if (!IS_ERR_OR_NULL(c->poly1305))
-		crypto_free_shash(c->poly1305);
-	if (!IS_ERR_OR_NULL(c->chacha20))
-		crypto_free_sync_skcipher(c->chacha20);
-	if (!IS_ERR_OR_NULL(c->sha256))
-		crypto_free_shash(c->sha256);
-}
-
-int bch2_fs_encryption_init(struct bch_fs *c)
-{
-	struct bch_sb_field_crypt *crypt;
-	struct bch_key key;
-	int ret = 0;
-
-	c->sha256 = crypto_alloc_shash("sha256", 0, 0);
-	ret = PTR_ERR_OR_ZERO(c->sha256);
-	if (ret) {
-		bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret));
-		goto out;
-	}
-
-	crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
-	if (!crypt)
-		goto out;
-
-	ret = bch2_alloc_ciphers(c);
-	if (ret)
-		goto out;
-
-	ret = bch2_decrypt_sb_key(c, crypt, &key);
-	if (ret)
-		goto out;
-
-	ret = crypto_skcipher_setkey(&c->chacha20->base,
-			(void *) &key.key, sizeof(key.key));
-	if (ret)
-		goto out;
-out:
-	memzero_explicit(&key, sizeof(key));
-	return ret;
-}
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
deleted file mode 100644
index e40499fde9a4..000000000000
--- a/fs/bcachefs/checksum.h
+++ /dev/null
@@ -1,237 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_CHECKSUM_H
-#define _BCACHEFS_CHECKSUM_H
-
-#include "bcachefs.h"
-#include "extents_types.h"
-#include "super-io.h"
-
-#include <linux/crc64.h>
-#include <crypto/chacha.h>
-
-static inline bool bch2_checksum_mergeable(unsigned type)
-{
-
-	switch (type) {
-	case BCH_CSUM_none:
-	case BCH_CSUM_crc32c:
-	case BCH_CSUM_crc64:
-		return true;
-	default:
-		return false;
-	}
-}
-
-struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum,
-				    struct bch_csum, size_t);
-
-#define BCH_NONCE_EXTENT	cpu_to_le32(1 << 28)
-#define BCH_NONCE_BTREE		cpu_to_le32(2 << 28)
-#define BCH_NONCE_JOURNAL	cpu_to_le32(3 << 28)
-#define BCH_NONCE_PRIO		cpu_to_le32(4 << 28)
-#define BCH_NONCE_POLY		cpu_to_le32(1 << 31)
-
-struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce,
-			     const void *, size_t);
-
-/*
- * This is used for various on disk data structures - bch_sb, prio_set, bset,
- * jset: The checksum is _always_ the first field of these structs
- */
-#define csum_vstruct(_c, _type, _nonce, _i)				\
-({									\
-	const void *_start = ((const void *) (_i)) + sizeof((_i)->csum);\
-									\
-	bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\
-})
-
-static inline void bch2_csum_to_text(struct printbuf *out,
-				     enum bch_csum_type type,
-				     struct bch_csum csum)
-{
-	const u8 *p = (u8 *) &csum;
-	unsigned bytes = type < BCH_CSUM_NR ? bch_crc_bytes[type] : 16;
-
-	for (unsigned i = 0; i < bytes; i++)
-		prt_hex_byte(out, p[i]);
-}
-
-static inline void bch2_csum_err_msg(struct printbuf *out,
-				     enum bch_csum_type type,
-				     struct bch_csum expected,
-				     struct bch_csum got)
-{
-	prt_str(out, "checksum error, type ");
-	bch2_prt_csum_type(out, type);
-	prt_str(out, ": got ");
-	bch2_csum_to_text(out, type, got);
-	prt_str(out, " should be ");
-	bch2_csum_to_text(out, type, expected);
-}
-
-int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
-int bch2_request_key(struct bch_sb *, struct bch_key *);
-#ifndef __KERNEL__
-int bch2_revoke_key(struct bch_sb *);
-#endif
-
-int bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
-		 void *data, size_t);
-
-struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned,
-				  struct nonce, struct bio *);
-
-int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion,
-			struct bch_extent_crc_unpacked,
-			struct bch_extent_crc_unpacked *,
-			struct bch_extent_crc_unpacked *,
-			unsigned, unsigned, unsigned);
-
-int __bch2_encrypt_bio(struct bch_fs *, unsigned,
-		       struct nonce, struct bio *);
-
-static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type,
-				   struct nonce nonce, struct bio *bio)
-{
-	return bch2_csum_type_is_encryption(type)
-		? __bch2_encrypt_bio(c, type, nonce, bio)
-		: 0;
-}
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_crypt;
-
-int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
-			struct bch_key *);
-
-int bch2_disable_encryption(struct bch_fs *);
-int bch2_enable_encryption(struct bch_fs *, bool);
-
-void bch2_fs_encryption_exit(struct bch_fs *);
-int bch2_fs_encryption_init(struct bch_fs *);
-
-static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
-						       bool data)
-{
-	switch (type) {
-	case BCH_CSUM_OPT_none:
-		return BCH_CSUM_none;
-	case BCH_CSUM_OPT_crc32c:
-		return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero;
-	case BCH_CSUM_OPT_crc64:
-		return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero;
-	case BCH_CSUM_OPT_xxhash:
-		return BCH_CSUM_xxhash;
-	default:
-		BUG();
-	}
-}
-
-static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
-							 struct bch_io_opts opts)
-{
-	if (opts.nocow)
-		return 0;
-
-	if (c->sb.encryption_type)
-		return c->opts.wide_macs
-			? BCH_CSUM_chacha20_poly1305_128
-			: BCH_CSUM_chacha20_poly1305_80;
-
-	return bch2_csum_opt_to_type(opts.data_checksum, true);
-}
-
-static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
-{
-	if (c->sb.encryption_type)
-		return BCH_CSUM_chacha20_poly1305_128;
-
-	return bch2_csum_opt_to_type(c->opts.metadata_checksum, false);
-}
-
-static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
-					   unsigned type)
-{
-	if (type >= BCH_CSUM_NR)
-		return false;
-
-	if (bch2_csum_type_is_encryption(type) && !c->chacha20)
-		return false;
-
-	return true;
-}
-
-/* returns true if not equal */
-static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
-{
-	/*
-	 * XXX: need some way of preventing the compiler from optimizing this
-	 * into a form that isn't constant time..
-	 */
-	return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0;
-}
-
-/* for skipping ahead and encrypting/decrypting at an offset: */
-static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
-{
-	EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
-
-	le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);
-	return nonce;
-}
-
-static inline struct nonce null_nonce(void)
-{
-	struct nonce ret;
-
-	memset(&ret, 0, sizeof(ret));
-	return ret;
-}
-
-static inline struct nonce extent_nonce(struct bversion version,
-					struct bch_extent_crc_unpacked crc)
-{
-	unsigned compression_type = crc_is_compressed(crc)
-		? crc.compression_type
-		: 0;
-	unsigned size = compression_type ? crc.uncompressed_size : 0;
-	struct nonce nonce = (struct nonce) {{
-		[0] = cpu_to_le32(size << 22),
-		[1] = cpu_to_le32(version.lo),
-		[2] = cpu_to_le32(version.lo >> 32),
-		[3] = cpu_to_le32(version.hi|
-				  (compression_type << 24))^BCH_NONCE_EXTENT,
-	}};
-
-	return nonce_add(nonce, crc.nonce << 9);
-}
-
-static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key)
-{
-	return le64_to_cpu(key->magic) != BCH_KEY_MAGIC;
-}
-
-static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb)
-{
-	__le64 magic = __bch2_sb_magic(sb);
-
-	return (struct nonce) {{
-		[0] = 0,
-		[1] = 0,
-		[2] = ((__le32 *) &magic)[0],
-		[3] = ((__le32 *) &magic)[1],
-	}};
-}
-
-static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c)
-{
-	__le64 magic = bch2_sb_magic(c);
-
-	return (struct nonce) {{
-		[0] = 0,
-		[1] = 0,
-		[2] = ((__le32 *) &magic)[0],
-		[3] = ((__le32 *) &magic)[1],
-	}};
-}
-
-#endif /* _BCACHEFS_CHECKSUM_H */
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
deleted file mode 100644
index 363644451106..000000000000
--- a/fs/bcachefs/clock.c
+++ /dev/null
@@ -1,193 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "clock.h"
-
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/preempt.h>
-
-static inline long io_timer_cmp(io_timer_heap *h,
-				struct io_timer *l,
-				struct io_timer *r)
-{
-	return l->expire - r->expire;
-}
-
-void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
-{
-	size_t i;
-
-	spin_lock(&clock->timer_lock);
-
-	if (time_after_eq((unsigned long) atomic64_read(&clock->now),
-			  timer->expire)) {
-		spin_unlock(&clock->timer_lock);
-		timer->fn(timer);
-		return;
-	}
-
-	for (i = 0; i < clock->timers.used; i++)
-		if (clock->timers.data[i] == timer)
-			goto out;
-
-	BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL));
-out:
-	spin_unlock(&clock->timer_lock);
-}
-
-void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
-{
-	size_t i;
-
-	spin_lock(&clock->timer_lock);
-
-	for (i = 0; i < clock->timers.used; i++)
-		if (clock->timers.data[i] == timer) {
-			heap_del(&clock->timers, i, io_timer_cmp, NULL);
-			break;
-		}
-
-	spin_unlock(&clock->timer_lock);
-}
-
-struct io_clock_wait {
-	struct io_timer		io_timer;
-	struct timer_list	cpu_timer;
-	struct task_struct	*task;
-	int			expired;
-};
-
-static void io_clock_wait_fn(struct io_timer *timer)
-{
-	struct io_clock_wait *wait = container_of(timer,
-				struct io_clock_wait, io_timer);
-
-	wait->expired = 1;
-	wake_up_process(wait->task);
-}
-
-static void io_clock_cpu_timeout(struct timer_list *timer)
-{
-	struct io_clock_wait *wait = container_of(timer,
-				struct io_clock_wait, cpu_timer);
-
-	wait->expired = 1;
-	wake_up_process(wait->task);
-}
-
-void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until)
-{
-	struct io_clock_wait wait;
-
-	/* XXX: calculate sleep time rigorously */
-	wait.io_timer.expire	= until;
-	wait.io_timer.fn	= io_clock_wait_fn;
-	wait.task		= current;
-	wait.expired		= 0;
-	bch2_io_timer_add(clock, &wait.io_timer);
-
-	schedule();
-
-	bch2_io_timer_del(clock, &wait.io_timer);
-}
-
-void bch2_kthread_io_clock_wait(struct io_clock *clock,
-				unsigned long io_until,
-				unsigned long cpu_timeout)
-{
-	bool kthread = (current->flags & PF_KTHREAD) != 0;
-	struct io_clock_wait wait;
-
-	wait.io_timer.expire	= io_until;
-	wait.io_timer.fn	= io_clock_wait_fn;
-	wait.task		= current;
-	wait.expired		= 0;
-	bch2_io_timer_add(clock, &wait.io_timer);
-
-	timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0);
-
-	if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
-		mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
-
-	do {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (kthread && kthread_should_stop())
-			break;
-
-		if (wait.expired)
-			break;
-
-		schedule();
-		try_to_freeze();
-	} while (0);
-
-	__set_current_state(TASK_RUNNING);
-	del_timer_sync(&wait.cpu_timer);
-	destroy_timer_on_stack(&wait.cpu_timer);
-	bch2_io_timer_del(clock, &wait.io_timer);
-}
-
-static struct io_timer *get_expired_timer(struct io_clock *clock,
-					  unsigned long now)
-{
-	struct io_timer *ret = NULL;
-
-	spin_lock(&clock->timer_lock);
-
-	if (clock->timers.used &&
-	    time_after_eq(now, clock->timers.data[0]->expire))
-		heap_pop(&clock->timers, ret, io_timer_cmp, NULL);
-
-	spin_unlock(&clock->timer_lock);
-
-	return ret;
-}
-
-void __bch2_increment_clock(struct io_clock *clock, unsigned sectors)
-{
-	struct io_timer *timer;
-	unsigned long now = atomic64_add_return(sectors, &clock->now);
-
-	while ((timer = get_expired_timer(clock, now)))
-		timer->fn(timer);
-}
-
-void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
-{
-	unsigned long now;
-	unsigned i;
-
-	out->atomic++;
-	spin_lock(&clock->timer_lock);
-	now = atomic64_read(&clock->now);
-
-	for (i = 0; i < clock->timers.used; i++)
-		prt_printf(out, "%ps:\t%li\n",
-		       clock->timers.data[i]->fn,
-		       clock->timers.data[i]->expire - now);
-	spin_unlock(&clock->timer_lock);
-	--out->atomic;
-}
-
-void bch2_io_clock_exit(struct io_clock *clock)
-{
-	free_heap(&clock->timers);
-	free_percpu(clock->pcpu_buf);
-}
-
-int bch2_io_clock_init(struct io_clock *clock)
-{
-	atomic64_set(&clock->now, 0);
-	spin_lock_init(&clock->timer_lock);
-
-	clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();
-
-	clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf);
-	if (!clock->pcpu_buf)
-		return -BCH_ERR_ENOMEM_io_clock_init;
-
-	if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL))
-		return -BCH_ERR_ENOMEM_io_clock_init;
-
-	return 0;
-}
diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h
deleted file mode 100644
index 70a0f7436c84..000000000000
--- a/fs/bcachefs/clock.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_CLOCK_H
-#define _BCACHEFS_CLOCK_H
-
-void bch2_io_timer_add(struct io_clock *, struct io_timer *);
-void bch2_io_timer_del(struct io_clock *, struct io_timer *);
-void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long,
-				unsigned long);
-
-void __bch2_increment_clock(struct io_clock *, unsigned);
-
-static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors,
-					int rw)
-{
-	struct io_clock *clock = &c->io_clock[rw];
-
-	if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >=
-		   IO_CLOCK_PCPU_SECTORS))
-		__bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0));
-}
-
-void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);
-
-#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\
-({									\
-	long __ret = timeout;						\
-	might_sleep();							\
-	if (!___wait_cond_timeout(condition))				\
-		__ret = __wait_event_timeout(wq, condition, timeout);	\
-	__ret;								\
-})
-
-void bch2_io_timers_to_text(struct printbuf *, struct io_clock *);
-
-void bch2_io_clock_exit(struct io_clock *);
-int bch2_io_clock_init(struct io_clock *);
-
-#endif /* _BCACHEFS_CLOCK_H */
diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h
deleted file mode 100644
index 5fae0012d808..000000000000
--- a/fs/bcachefs/clock_types.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_CLOCK_TYPES_H
-#define _BCACHEFS_CLOCK_TYPES_H
-
-#include "util.h"
-
-#define NR_IO_TIMERS		(BCH_SB_MEMBERS_MAX * 3)
-
-/*
- * Clocks/timers in units of sectors of IO:
- *
- * Note - they use percpu batching, so they're only approximate.
- */
-
-struct io_timer;
-typedef void (*io_timer_fn)(struct io_timer *);
-
-struct io_timer {
-	io_timer_fn		fn;
-	unsigned long		expire;
-};
-
-/* Amount to buffer up on a percpu counter */
-#define IO_CLOCK_PCPU_SECTORS	128
-
-typedef HEAP(struct io_timer *)	io_timer_heap;
-
-struct io_clock {
-	atomic64_t		now;
-	u16 __percpu		*pcpu_buf;
-	unsigned		max_slop;
-
-	spinlock_t		timer_lock;
-	io_timer_heap		timers;
-};
-
-#endif /* _BCACHEFS_CLOCK_TYPES_H */
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
deleted file mode 100644
index 1410365a8891..000000000000
--- a/fs/bcachefs/compress.c
+++ /dev/null
@@ -1,728 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "checksum.h"
-#include "compress.h"
-#include "extents.h"
-#include "super-io.h"
-
-#include <linux/lz4.h>
-#include <linux/zlib.h>
-#include <linux/zstd.h>
-
-/* Bounce buffer: */
-struct bbuf {
-	void		*b;
-	enum {
-		BB_NONE,
-		BB_VMAP,
-		BB_KMALLOC,
-		BB_MEMPOOL,
-	}		type;
-	int		rw;
-};
-
-static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw)
-{
-	void *b;
-
-	BUG_ON(size > c->opts.encoded_extent_max);
-
-	b = kmalloc(size, GFP_NOFS|__GFP_NOWARN);
-	if (b)
-		return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw };
-
-	b = mempool_alloc(&c->compression_bounce[rw], GFP_NOFS);
-	if (b)
-		return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
-
-	BUG();
-}
-
-static bool bio_phys_contig(struct bio *bio, struct bvec_iter start)
-{
-	struct bio_vec bv;
-	struct bvec_iter iter;
-	void *expected_start = NULL;
-
-	__bio_for_each_bvec(bv, bio, iter, start) {
-		if (expected_start &&
-		    expected_start != page_address(bv.bv_page) + bv.bv_offset)
-			return false;
-
-		expected_start = page_address(bv.bv_page) +
-			bv.bv_offset + bv.bv_len;
-	}
-
-	return true;
-}
-
-static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
-				       struct bvec_iter start, int rw)
-{
-	struct bbuf ret;
-	struct bio_vec bv;
-	struct bvec_iter iter;
-	unsigned nr_pages = 0;
-	struct page *stack_pages[16];
-	struct page **pages = NULL;
-	void *data;
-
-	BUG_ON(start.bi_size > c->opts.encoded_extent_max);
-
-	if (!PageHighMem(bio_iter_page(bio, start)) &&
-	    bio_phys_contig(bio, start))
-		return (struct bbuf) {
-			.b = page_address(bio_iter_page(bio, start)) +
-				bio_iter_offset(bio, start),
-			.type = BB_NONE, .rw = rw
-		};
-
-	/* check if we can map the pages contiguously: */
-	__bio_for_each_segment(bv, bio, iter, start) {
-		if (iter.bi_size != start.bi_size &&
-		    bv.bv_offset)
-			goto bounce;
-
-		if (bv.bv_len < iter.bi_size &&
-		    bv.bv_offset + bv.bv_len < PAGE_SIZE)
-			goto bounce;
-
-		nr_pages++;
-	}
-
-	BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages);
-
-	pages = nr_pages > ARRAY_SIZE(stack_pages)
-		? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS)
-		: stack_pages;
-	if (!pages)
-		goto bounce;
-
-	nr_pages = 0;
-	__bio_for_each_segment(bv, bio, iter, start)
-		pages[nr_pages++] = bv.bv_page;
-
-	data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
-	if (pages != stack_pages)
-		kfree(pages);
-
-	if (data)
-		return (struct bbuf) {
-			.b = data + bio_iter_offset(bio, start),
-			.type = BB_VMAP, .rw = rw
-		};
-bounce:
-	ret = __bounce_alloc(c, start.bi_size, rw);
-
-	if (rw == READ)
-		memcpy_from_bio(ret.b, bio, start);
-
-	return ret;
-}
-
-static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw)
-{
-	return __bio_map_or_bounce(c, bio, bio->bi_iter, rw);
-}
-
-static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf)
-{
-	switch (buf.type) {
-	case BB_NONE:
-		break;
-	case BB_VMAP:
-		vunmap((void *) ((unsigned long) buf.b & PAGE_MASK));
-		break;
-	case BB_KMALLOC:
-		kfree(buf.b);
-		break;
-	case BB_MEMPOOL:
-		mempool_free(buf.b, &c->compression_bounce[buf.rw]);
-		break;
-	}
-}
-
-static inline void zlib_set_workspace(z_stream *strm, void *workspace)
-{
-#ifdef __KERNEL__
-	strm->workspace = workspace;
-#endif
-}
-
-static int __bio_uncompress(struct bch_fs *c, struct bio *src,
-			    void *dst_data, struct bch_extent_crc_unpacked crc)
-{
-	struct bbuf src_data = { NULL };
-	size_t src_len = src->bi_iter.bi_size;
-	size_t dst_len = crc.uncompressed_size << 9;
-	void *workspace;
-	int ret;
-
-	src_data = bio_map_or_bounce(c, src, READ);
-
-	switch (crc.compression_type) {
-	case BCH_COMPRESSION_TYPE_lz4_old:
-	case BCH_COMPRESSION_TYPE_lz4:
-		ret = LZ4_decompress_safe_partial(src_data.b, dst_data,
-						  src_len, dst_len, dst_len);
-		if (ret != dst_len)
-			goto err;
-		break;
-	case BCH_COMPRESSION_TYPE_gzip: {
-		z_stream strm = {
-			.next_in	= src_data.b,
-			.avail_in	= src_len,
-			.next_out	= dst_data,
-			.avail_out	= dst_len,
-		};
-
-		workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS);
-
-		zlib_set_workspace(&strm, workspace);
-		zlib_inflateInit2(&strm, -MAX_WBITS);
-		ret = zlib_inflate(&strm, Z_FINISH);
-
-		mempool_free(workspace, &c->decompress_workspace);
-
-		if (ret != Z_STREAM_END)
-			goto err;
-		break;
-	}
-	case BCH_COMPRESSION_TYPE_zstd: {
-		ZSTD_DCtx *ctx;
-		size_t real_src_len = le32_to_cpup(src_data.b);
-
-		if (real_src_len > src_len - 4)
-			goto err;
-
-		workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS);
-		ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
-
-		ret = zstd_decompress_dctx(ctx,
-				dst_data,	dst_len,
-				src_data.b + 4, real_src_len);
-
-		mempool_free(workspace, &c->decompress_workspace);
-
-		if (ret != dst_len)
-			goto err;
-		break;
-	}
-	default:
-		BUG();
-	}
-	ret = 0;
-out:
-	bio_unmap_or_unbounce(c, src_data);
-	return ret;
-err:
-	ret = -EIO;
-	goto out;
-}
-
-int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
-				struct bch_extent_crc_unpacked *crc)
-{
-	struct bbuf data = { NULL };
-	size_t dst_len = crc->uncompressed_size << 9;
-
-	/* bio must own its pages: */
-	BUG_ON(!bio->bi_vcnt);
-	BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);
-
-	if (crc->uncompressed_size << 9	> c->opts.encoded_extent_max ||
-	    crc->compressed_size << 9	> c->opts.encoded_extent_max) {
-		bch_err(c, "error rewriting existing data: extent too big");
-		return -EIO;
-	}
-
-	data = __bounce_alloc(c, dst_len, WRITE);
-
-	if (__bio_uncompress(c, bio, data.b, *crc)) {
-		if (!c->opts.no_data_io)
-			bch_err(c, "error rewriting existing data: decompression error");
-		bio_unmap_or_unbounce(c, data);
-		return -EIO;
-	}
-
-	/*
-	 * XXX: don't have a good way to assert that the bio was allocated with
-	 * enough space, we depend on bch2_move_extent doing the right thing
-	 */
-	bio->bi_iter.bi_size = crc->live_size << 9;
-
-	memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9));
-
-	crc->csum_type		= 0;
-	crc->compression_type	= 0;
-	crc->compressed_size	= crc->live_size;
-	crc->uncompressed_size	= crc->live_size;
-	crc->offset		= 0;
-	crc->csum		= (struct bch_csum) { 0, 0 };
-
-	bio_unmap_or_unbounce(c, data);
-	return 0;
-}
-
-int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
-		       struct bio *dst, struct bvec_iter dst_iter,
-		       struct bch_extent_crc_unpacked crc)
-{
-	struct bbuf dst_data = { NULL };
-	size_t dst_len = crc.uncompressed_size << 9;
-	int ret;
-
-	if (crc.uncompressed_size << 9	> c->opts.encoded_extent_max ||
-	    crc.compressed_size << 9	> c->opts.encoded_extent_max)
-		return -EIO;
-
-	dst_data = dst_len == dst_iter.bi_size
-		? __bio_map_or_bounce(c, dst, dst_iter, WRITE)
-		: __bounce_alloc(c, dst_len, WRITE);
-
-	ret = __bio_uncompress(c, src, dst_data.b, crc);
-	if (ret)
-		goto err;
-
-	if (dst_data.type != BB_NONE &&
-	    dst_data.type != BB_VMAP)
-		memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9));
-err:
-	bio_unmap_or_unbounce(c, dst_data);
-	return ret;
-}
-
-static int attempt_compress(struct bch_fs *c,
-			    void *workspace,
-			    void *dst, size_t dst_len,
-			    void *src, size_t src_len,
-			    struct bch_compression_opt compression)
-{
-	enum bch_compression_type compression_type =
-		__bch2_compression_opt_to_type[compression.type];
-
-	switch (compression_type) {
-	case BCH_COMPRESSION_TYPE_lz4:
-		if (compression.level < LZ4HC_MIN_CLEVEL) {
-			int len = src_len;
-			int ret = LZ4_compress_destSize(
-					src,		dst,
-					&len,		dst_len,
-					workspace);
-			if (len < src_len)
-				return -len;
-
-			return ret;
-		} else {
-			int ret = LZ4_compress_HC(
-					src,		dst,
-					src_len,	dst_len,
-					compression.level,
-					workspace);
-
-			return ret ?: -1;
-		}
-	case BCH_COMPRESSION_TYPE_gzip: {
-		z_stream strm = {
-			.next_in	= src,
-			.avail_in	= src_len,
-			.next_out	= dst,
-			.avail_out	= dst_len,
-		};
-
-		zlib_set_workspace(&strm, workspace);
-		zlib_deflateInit2(&strm,
-				  compression.level
-				  ? clamp_t(unsigned, compression.level,
-					    Z_BEST_SPEED, Z_BEST_COMPRESSION)
-				  : Z_DEFAULT_COMPRESSION,
-				  Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
-				  Z_DEFAULT_STRATEGY);
-
-		if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END)
-			return 0;
-
-		if (zlib_deflateEnd(&strm) != Z_OK)
-			return 0;
-
-		return strm.total_out;
-	}
-	case BCH_COMPRESSION_TYPE_zstd: {
-		/*
-		 * rescale:
-		 * zstd max compression level is 22, our max level is 15
-		 */
-		unsigned level = min((compression.level * 3) / 2, zstd_max_clevel());
-		ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max);
-		ZSTD_CCtx *ctx = zstd_init_cctx(workspace, c->zstd_workspace_size);
-
-		/*
-		 * ZSTD requires that when we decompress we pass in the exact
-		 * compressed size - rounding it up to the nearest sector
-		 * doesn't work, so we use the first 4 bytes of the buffer for
-		 * that.
-		 *
-		 * Additionally, the ZSTD code seems to have a bug where it will
-		 * write just past the end of the buffer - so subtract a fudge
-		 * factor (7 bytes) from the dst buffer size to account for
-		 * that.
-		 */
-		size_t len = zstd_compress_cctx(ctx,
-				dst + 4,	dst_len - 4 - 7,
-				src,		src_len,
-				&params);
-		if (zstd_is_error(len))
-			return 0;
-
-		*((__le32 *) dst) = cpu_to_le32(len);
-		return len + 4;
-	}
-	default:
-		BUG();
-	}
-}
-
-static unsigned __bio_compress(struct bch_fs *c,
-			       struct bio *dst, size_t *dst_len,
-			       struct bio *src, size_t *src_len,
-			       struct bch_compression_opt compression)
-{
-	struct bbuf src_data = { NULL }, dst_data = { NULL };
-	void *workspace;
-	enum bch_compression_type compression_type =
-		__bch2_compression_opt_to_type[compression.type];
-	unsigned pad;
-	int ret = 0;
-
-	BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR);
-	BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type]));
-
-	/* If it's only one block, don't bother trying to compress: */
-	if (src->bi_iter.bi_size <= c->opts.block_size)
-		return BCH_COMPRESSION_TYPE_incompressible;
-
-	dst_data = bio_map_or_bounce(c, dst, WRITE);
-	src_data = bio_map_or_bounce(c, src, READ);
-
-	workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOFS);
-
-	*src_len = src->bi_iter.bi_size;
-	*dst_len = dst->bi_iter.bi_size;
-
-	/*
-	 * XXX: this algorithm sucks when the compression code doesn't tell us
-	 * how much would fit, like LZ4 does:
-	 */
-	while (1) {
-		if (*src_len <= block_bytes(c)) {
-			ret = -1;
-			break;
-		}
-
-		ret = attempt_compress(c, workspace,
-				       dst_data.b,	*dst_len,
-				       src_data.b,	*src_len,
-				       compression);
-		if (ret > 0) {
-			*dst_len = ret;
-			ret = 0;
-			break;
-		}
-
-		/* Didn't fit: should we retry with a smaller amount?  */
-		if (*src_len <= *dst_len) {
-			ret = -1;
-			break;
-		}
-
-		/*
-		 * If ret is negative, it's a hint as to how much data would fit
-		 */
-		BUG_ON(-ret >= *src_len);
-
-		if (ret < 0)
-			*src_len = -ret;
-		else
-			*src_len -= (*src_len - *dst_len) / 2;
-		*src_len = round_down(*src_len, block_bytes(c));
-	}
-
-	mempool_free(workspace, &c->compress_workspace[compression_type]);
-
-	if (ret)
-		goto err;
-
-	/* Didn't get smaller: */
-	if (round_up(*dst_len, block_bytes(c)) >= *src_len)
-		goto err;
-
-	pad = round_up(*dst_len, block_bytes(c)) - *dst_len;
-
-	memset(dst_data.b + *dst_len, 0, pad);
-	*dst_len += pad;
-
-	if (dst_data.type != BB_NONE &&
-	    dst_data.type != BB_VMAP)
-		memcpy_to_bio(dst, dst->bi_iter, dst_data.b);
-
-	BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
-	BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size);
-	BUG_ON(*dst_len & (block_bytes(c) - 1));
-	BUG_ON(*src_len & (block_bytes(c) - 1));
-	ret = compression_type;
-out:
-	bio_unmap_or_unbounce(c, src_data);
-	bio_unmap_or_unbounce(c, dst_data);
-	return ret;
-err:
-	ret = BCH_COMPRESSION_TYPE_incompressible;
-	goto out;
-}
-
-unsigned bch2_bio_compress(struct bch_fs *c,
-			   struct bio *dst, size_t *dst_len,
-			   struct bio *src, size_t *src_len,
-			   unsigned compression_opt)
-{
-	unsigned orig_dst = dst->bi_iter.bi_size;
-	unsigned orig_src = src->bi_iter.bi_size;
-	unsigned compression_type;
-
-	/* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
-	src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size,
-				     c->opts.encoded_extent_max);
-	/* Don't generate a bigger output than input: */
-	dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
-
-	compression_type =
-		__bio_compress(c, dst, dst_len, src, src_len,
-			       bch2_compression_decode(compression_opt));
-
-	dst->bi_iter.bi_size = orig_dst;
-	src->bi_iter.bi_size = orig_src;
-	return compression_type;
-}
-
-static int __bch2_fs_compress_init(struct bch_fs *, u64);
-
-#define BCH_FEATURE_none	0
-
-static const unsigned bch2_compression_opt_to_feature[] = {
-#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t,
-	BCH_COMPRESSION_OPTS()
-#undef x
-};
-
-#undef BCH_FEATURE_none
-
-static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
-{
-	int ret = 0;
-
-	if ((c->sb.features & f) == f)
-		return 0;
-
-	mutex_lock(&c->sb_lock);
-
-	if ((c->sb.features & f) == f) {
-		mutex_unlock(&c->sb_lock);
-		return 0;
-	}
-
-	ret = __bch2_fs_compress_init(c, c->sb.features|f);
-	if (ret) {
-		mutex_unlock(&c->sb_lock);
-		return ret;
-	}
-
-	c->disk_sb.sb->features[0] |= cpu_to_le64(f);
-	bch2_write_super(c);
-	mutex_unlock(&c->sb_lock);
-
-	return 0;
-}
-
-int bch2_check_set_has_compressed_data(struct bch_fs *c,
-				       unsigned compression_opt)
-{
-	unsigned compression_type = bch2_compression_decode(compression_opt).type;
-
-	BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature));
-
-	return compression_type
-		? __bch2_check_set_has_compressed_data(c,
-				1ULL << bch2_compression_opt_to_feature[compression_type])
-		: 0;
-}
-
-void bch2_fs_compress_exit(struct bch_fs *c)
-{
-	unsigned i;
-
-	mempool_exit(&c->decompress_workspace);
-	for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++)
-		mempool_exit(&c->compress_workspace[i]);
-	mempool_exit(&c->compression_bounce[WRITE]);
-	mempool_exit(&c->compression_bounce[READ]);
-}
-
-static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
-{
-	size_t decompress_workspace_size = 0;
-	ZSTD_parameters params = zstd_get_params(zstd_max_clevel(),
-						 c->opts.encoded_extent_max);
-
-	c->zstd_workspace_size = zstd_cctx_workspace_bound(&params.cParams);
-
-	struct {
-		unsigned			feature;
-		enum bch_compression_type	type;
-		size_t				compress_workspace;
-		size_t				decompress_workspace;
-	} compression_types[] = {
-		{ BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4,
-			max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS),
-			0 },
-		{ BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip,
-			zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
-			zlib_inflate_workspacesize(), },
-		{ BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd,
-			c->zstd_workspace_size,
-			zstd_dctx_workspace_bound() },
-	}, *i;
-	bool have_compressed = false;
-
-	for (i = compression_types;
-	     i < compression_types + ARRAY_SIZE(compression_types);
-	     i++)
-		have_compressed |= (features & (1 << i->feature)) != 0;
-
-	if (!have_compressed)
-		return 0;
-
-	if (!mempool_initialized(&c->compression_bounce[READ]) &&
-	    mempool_init_kvmalloc_pool(&c->compression_bounce[READ],
-				       1, c->opts.encoded_extent_max))
-		return -BCH_ERR_ENOMEM_compression_bounce_read_init;
-
-	if (!mempool_initialized(&c->compression_bounce[WRITE]) &&
-	    mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE],
-				       1, c->opts.encoded_extent_max))
-		return -BCH_ERR_ENOMEM_compression_bounce_write_init;
-
-	for (i = compression_types;
-	     i < compression_types + ARRAY_SIZE(compression_types);
-	     i++) {
-		decompress_workspace_size =
-			max(decompress_workspace_size, i->decompress_workspace);
-
-		if (!(features & (1 << i->feature)))
-			continue;
-
-		if (mempool_initialized(&c->compress_workspace[i->type]))
-			continue;
-
-		if (mempool_init_kvmalloc_pool(
-				&c->compress_workspace[i->type],
-				1, i->compress_workspace))
-			return -BCH_ERR_ENOMEM_compression_workspace_init;
-	}
-
-	if (!mempool_initialized(&c->decompress_workspace) &&
-	    mempool_init_kvmalloc_pool(&c->decompress_workspace,
-				       1, decompress_workspace_size))
-		return -BCH_ERR_ENOMEM_decompression_workspace_init;
-
-	return 0;
-}
-
-static u64 compression_opt_to_feature(unsigned v)
-{
-	unsigned type = bch2_compression_decode(v).type;
-
-	return BIT_ULL(bch2_compression_opt_to_feature[type]);
-}
-
-int bch2_fs_compress_init(struct bch_fs *c)
-{
-	u64 f = c->sb.features;
-
-	f |= compression_opt_to_feature(c->opts.compression);
-	f |= compression_opt_to_feature(c->opts.background_compression);
-
-	return __bch2_fs_compress_init(c, f);
-}
-
-int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res,
-			       struct printbuf *err)
-{
-	char *val = kstrdup(_val, GFP_KERNEL);
-	char *p = val, *type_str, *level_str;
-	struct bch_compression_opt opt = { 0 };
-	int ret;
-
-	if (!val)
-		return -ENOMEM;
-
-	type_str = strsep(&p, ":");
-	level_str = p;
-
-	ret = match_string(bch2_compression_opts, -1, type_str);
-	if (ret < 0 && err)
-		prt_str(err, "invalid compression type");
-	if (ret < 0)
-		goto err;
-
-	opt.type = ret;
-
-	if (level_str) {
-		unsigned level;
-
-		ret = kstrtouint(level_str, 10, &level);
-		if (!ret && !opt.type && level)
-			ret = -EINVAL;
-		if (!ret && level > 15)
-			ret = -EINVAL;
-		if (ret < 0 && err)
-			prt_str(err, "invalid compression level");
-		if (ret < 0)
-			goto err;
-
-		opt.level = level;
-	}
-
-	*res = bch2_compression_encode(opt);
-err:
-	kfree(val);
-	return ret;
-}
-
-void bch2_compression_opt_to_text(struct printbuf *out, u64 v)
-{
-	struct bch_compression_opt opt = bch2_compression_decode(v);
-
-	if (opt.type < BCH_COMPRESSION_OPT_NR)
-		prt_str(out, bch2_compression_opts[opt.type]);
-	else
-		prt_printf(out, "(unknown compression opt %u)", opt.type);
-	if (opt.level)
-		prt_printf(out, ":%u", opt.level);
-}
-
-void bch2_opt_compression_to_text(struct printbuf *out,
-				  struct bch_fs *c,
-				  struct bch_sb *sb,
-				  u64 v)
-{
-	return bch2_compression_opt_to_text(out, v);
-}
-
-int bch2_opt_compression_validate(u64 v, struct printbuf *err)
-{
-	if (!bch2_compression_opt_valid(v)) {
-		prt_printf(err, "invalid compression opt %llu", v);
-		return -BCH_ERR_invalid_sb_opt_compression;
-	}
-
-	return 0;
-}
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
deleted file mode 100644
index 607fd5e232c9..000000000000
--- a/fs/bcachefs/compress.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_COMPRESS_H
-#define _BCACHEFS_COMPRESS_H
-
-#include "extents_types.h"
-
-static const unsigned __bch2_compression_opt_to_type[] = {
-#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t,
-	BCH_COMPRESSION_OPTS()
-#undef x
-};
-
-struct bch_compression_opt {
-	u8		type:4,
-			level:4;
-};
-
-static inline struct bch_compression_opt __bch2_compression_decode(unsigned v)
-{
-	return (struct bch_compression_opt) {
-		.type	= v & 15,
-		.level	= v >> 4,
-	};
-}
-
-static inline bool bch2_compression_opt_valid(unsigned v)
-{
-	struct bch_compression_opt opt = __bch2_compression_decode(v);
-
-	return opt.type < ARRAY_SIZE(__bch2_compression_opt_to_type) && !(!opt.type && opt.level);
-}
-
-static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
-{
-	return bch2_compression_opt_valid(v)
-		? __bch2_compression_decode(v)
-		: (struct bch_compression_opt) { 0 };
-}
-
-static inline unsigned bch2_compression_encode(struct bch_compression_opt opt)
-{
-	return opt.type|(opt.level << 4);
-}
-
-static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
-{
-	return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
-}
-
-int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
-				struct bch_extent_crc_unpacked *);
-int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
-		       struct bvec_iter, struct bch_extent_crc_unpacked);
-unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
-			   struct bio *, size_t *, unsigned);
-
-int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
-void bch2_fs_compress_exit(struct bch_fs *);
-int bch2_fs_compress_init(struct bch_fs *);
-
-void bch2_compression_opt_to_text(struct printbuf *, u64);
-
-int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
-void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
-int bch2_opt_compression_validate(u64, struct printbuf *);
-
-#define bch2_opt_compression (struct bch_opt_fn) {		\
-	.parse		= bch2_opt_compression_parse,		\
-	.to_text	= bch2_opt_compression_to_text,		\
-	.validate	= bch2_opt_compression_validate,	\
-}
-
-#endif /* _BCACHEFS_COMPRESS_H */
diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c
deleted file mode 100644
index ac35b8b705ae..000000000000
--- a/fs/bcachefs/darray.c
+++ /dev/null
@@ -1,24 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/log2.h>
-#include <linux/slab.h>
-#include "darray.h"
-
-int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
-{
-	if (new_size > d->size) {
-		new_size = roundup_pow_of_two(new_size);
-
-		void *data = kvmalloc_array(new_size, element_size, gfp);
-		if (!data)
-			return -ENOMEM;
-
-		memcpy(data, d->data, d->size * element_size);
-		if (d->data != d->preallocated)
-			kvfree(d->data);
-		d->data	= data;
-		d->size = new_size;
-	}
-
-	return 0;
-}
diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
deleted file mode 100644
index 4b340d13caac..000000000000
--- a/fs/bcachefs/darray.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DARRAY_H
-#define _BCACHEFS_DARRAY_H
-
-/*
- * Dynamic arrays:
- *
- * Inspired by CCAN's darray
- */
-
-#include <linux/slab.h>
-
-#define DARRAY_PREALLOCATED(_type, _nr)					\
-struct {								\
-	size_t nr, size;						\
-	_type *data;							\
-	_type preallocated[_nr];					\
-}
-
-#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0)
-
-typedef DARRAY(char)	darray_char;
-typedef DARRAY(char *) darray_str;
-
-int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t);
-
-static inline int __darray_resize(darray_char *d, size_t element_size,
-				  size_t new_size, gfp_t gfp)
-{
-	return unlikely(new_size > d->size)
-		? __bch2_darray_resize(d, element_size, new_size, gfp)
-		: 0;
-}
-
-#define darray_resize_gfp(_d, _new_size, _gfp)				\
-	unlikely(__darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp))
-
-#define darray_resize(_d, _new_size)					\
-	darray_resize_gfp(_d, _new_size, GFP_KERNEL)
-
-static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more, gfp_t gfp)
-{
-	return __darray_resize(d, t_size, d->nr + more, gfp);
-}
-
-#define darray_make_room_gfp(_d, _more, _gfp)				\
-	__darray_make_room((darray_char *) (_d), sizeof((_d)->data[0]), (_more), _gfp)
-
-#define darray_make_room(_d, _more)					\
-	darray_make_room_gfp(_d, _more, GFP_KERNEL)
-
-#define darray_room(_d)		((_d).size - (_d).nr)
-
-#define darray_top(_d)		((_d).data[(_d).nr])
-
-#define darray_push_gfp(_d, _item, _gfp)				\
-({									\
-	int _ret = darray_make_room_gfp((_d), 1, _gfp);			\
-									\
-	if (!_ret)							\
-		(_d)->data[(_d)->nr++] = (_item);			\
-	_ret;								\
-})
-
-#define darray_push(_d, _item)	darray_push_gfp(_d, _item, GFP_KERNEL)
-
-#define darray_pop(_d)		((_d)->data[--(_d)->nr])
-
-#define darray_first(_d)	((_d).data[0])
-#define darray_last(_d)		((_d).data[(_d).nr - 1])
-
-#define darray_insert_item(_d, pos, _item)				\
-({									\
-	size_t _pos = (pos);						\
-	int _ret = darray_make_room((_d), 1);				\
-									\
-	if (!_ret)							\
-		array_insert_item((_d)->data, (_d)->nr, _pos, (_item));	\
-	_ret;								\
-})
-
-#define darray_remove_item(_d, _pos)					\
-	array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data)
-
-#define __darray_for_each(_d, _i)						\
-	for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++)
-
-#define darray_for_each(_d, _i)						\
-	for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++)
-
-#define darray_for_each_reverse(_d, _i)					\
-	for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i)
-
-#define darray_init(_d)							\
-do {									\
-	(_d)->nr = 0;							\
-	(_d)->size = ARRAY_SIZE((_d)->preallocated);			\
-	(_d)->data = (_d)->size ? (_d)->preallocated : NULL;		\
-} while (0)
-
-#define darray_exit(_d)							\
-do {									\
-	if (!ARRAY_SIZE((_d)->preallocated) ||				\
-	    (_d)->data != (_d)->preallocated)				\
-		kvfree((_d)->data);					\
-	darray_init(_d);						\
-} while (0)
-
-#endif /* _BCACHEFS_DARRAY_H */
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
deleted file mode 100644
index 0d807c2ce9c6..000000000000
--- a/fs/bcachefs/data_update.c
+++ /dev/null
@@ -1,696 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "data_update.h"
-#include "ec.h"
-#include "error.h"
-#include "extents.h"
-#include "io_write.h"
-#include "keylist.h"
-#include "move.h"
-#include "nocow_locking.h"
-#include "rebalance.h"
-#include "snapshot.h"
-#include "subvolume.h"
-#include "trace.h"
-
-static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k)
-{
-	if (trace_move_extent_finish_enabled()) {
-		struct printbuf buf = PRINTBUF;
-
-		bch2_bkey_val_to_text(&buf, c, k);
-		trace_move_extent_finish(c, buf.buf);
-		printbuf_exit(&buf);
-	}
-}
-
-static void trace_move_extent_fail2(struct data_update *m,
-			 struct bkey_s_c new,
-			 struct bkey_s_c wrote,
-			 struct bkey_i *insert,
-			 const char *msg)
-{
-	struct bch_fs *c = m->op.c;
-	struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
-	const union bch_extent_entry *entry;
-	struct bch_extent_ptr *ptr;
-	struct extent_ptr_decoded p;
-	struct printbuf buf = PRINTBUF;
-	unsigned i, rewrites_found = 0;
-
-	if (!trace_move_extent_fail_enabled())
-		return;
-
-	prt_str(&buf, msg);
-
-	if (insert) {
-		i = 0;
-		bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
-			if (((1U << i) & m->data_opts.rewrite_ptrs) &&
-			    (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
-			    !ptr->cached)
-				rewrites_found |= 1U << i;
-			i++;
-		}
-	}
-
-	prt_printf(&buf, "\nrewrite ptrs:   %u%u%u%u",
-		   (m->data_opts.rewrite_ptrs & (1 << 0)) != 0,
-		   (m->data_opts.rewrite_ptrs & (1 << 1)) != 0,
-		   (m->data_opts.rewrite_ptrs & (1 << 2)) != 0,
-		   (m->data_opts.rewrite_ptrs & (1 << 3)) != 0);
-
-	prt_printf(&buf, "\nrewrites found: %u%u%u%u",
-		   (rewrites_found & (1 << 0)) != 0,
-		   (rewrites_found & (1 << 1)) != 0,
-		   (rewrites_found & (1 << 2)) != 0,
-		   (rewrites_found & (1 << 3)) != 0);
-
-	prt_str(&buf, "\nold:    ");
-	bch2_bkey_val_to_text(&buf, c, old);
-
-	prt_str(&buf, "\nnew:    ");
-	bch2_bkey_val_to_text(&buf, c, new);
-
-	prt_str(&buf, "\nwrote:  ");
-	bch2_bkey_val_to_text(&buf, c, wrote);
-
-	if (insert) {
-		prt_str(&buf, "\ninsert: ");
-		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
-	}
-
-	trace_move_extent_fail(c, buf.buf);
-	printbuf_exit(&buf);
-}
-
-static int __bch2_data_update_index_update(struct btree_trans *trans,
-					   struct bch_write_op *op)
-{
-	struct bch_fs *c = op->c;
-	struct btree_iter iter;
-	struct data_update *m =
-		container_of(op, struct data_update, op);
-	struct keylist *keys = &op->insert_keys;
-	struct bkey_buf _new, _insert;
-	int ret = 0;
-
-	bch2_bkey_buf_init(&_new);
-	bch2_bkey_buf_init(&_insert);
-	bch2_bkey_buf_realloc(&_insert, c, U8_MAX);
-
-	bch2_trans_iter_init(trans, &iter, m->btree_id,
-			     bkey_start_pos(&bch2_keylist_front(keys)->k),
-			     BTREE_ITER_slots|BTREE_ITER_intent);
-
-	while (1) {
-		struct bkey_s_c k;
-		struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
-		struct bkey_i *insert = NULL;
-		struct bkey_i_extent *new;
-		const union bch_extent_entry *entry_c;
-		union bch_extent_entry *entry;
-		struct extent_ptr_decoded p;
-		struct bch_extent_ptr *ptr;
-		const struct bch_extent_ptr *ptr_c;
-		struct bpos next_pos;
-		bool should_check_enospc;
-		s64 i_sectors_delta = 0, disk_sectors_delta = 0;
-		unsigned rewrites_found = 0, durability, i;
-
-		bch2_trans_begin(trans);
-
-		k = bch2_btree_iter_peek_slot(&iter);
-		ret = bkey_err(k);
-		if (ret)
-			goto err;
-
-		new = bkey_i_to_extent(bch2_keylist_front(keys));
-
-		if (!bch2_extents_match(k, old)) {
-			trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i),
-						NULL, "no match:");
-			goto nowork;
-		}
-
-		bkey_reassemble(_insert.k, k);
-		insert = _insert.k;
-
-		bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
-		new = bkey_i_to_extent(_new.k);
-		bch2_cut_front(iter.pos, &new->k_i);
-
-		bch2_cut_front(iter.pos,	insert);
-		bch2_cut_back(new->k.p,		insert);
-		bch2_cut_back(insert->k.p,	&new->k_i);
-
-		/*
-		 * @old: extent that we read from
-		 * @insert: key that we're going to update, initialized from
-		 * extent currently in btree - same as @old unless we raced with
-		 * other updates
-		 * @new: extent with new pointers that we'll be adding to @insert
-		 *
-		 * Fist, drop rewrite_ptrs from @new:
-		 */
-		i = 0;
-		bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) {
-			if (((1U << i) & m->data_opts.rewrite_ptrs) &&
-			    (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
-			    !ptr->cached) {
-				bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr);
-				rewrites_found |= 1U << i;
-			}
-			i++;
-		}
-
-		if (m->data_opts.rewrite_ptrs &&
-		    !rewrites_found &&
-		    bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) {
-			trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:");
-			goto nowork;
-		}
-
-		/*
-		 * A replica that we just wrote might conflict with a replica
-		 * that we want to keep, due to racing with another move:
-		 */
-restart_drop_conflicting_replicas:
-		extent_for_each_ptr(extent_i_to_s(new), ptr)
-			if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(insert), ptr->dev)) &&
-			    !ptr_c->cached) {
-				bch2_bkey_drop_ptr_noerror(bkey_i_to_s(&new->k_i), ptr);
-				goto restart_drop_conflicting_replicas;
-			}
-
-		if (!bkey_val_u64s(&new->k)) {
-			trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
-			goto nowork;
-		}
-
-		/* Now, drop pointers that conflict with what we just wrote: */
-		extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
-			if ((ptr = bch2_bkey_has_device(bkey_i_to_s(insert), p.ptr.dev)))
-				bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
-
-		durability = bch2_bkey_durability(c, bkey_i_to_s_c(insert)) +
-			bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i));
-
-		/* Now, drop excess replicas: */
-restart_drop_extra_replicas:
-
-		rcu_read_lock();
-		bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
-			unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
-
-			if (!p.ptr.cached &&
-			    durability - ptr_durability >= m->op.opts.data_replicas) {
-				durability -= ptr_durability;
-
-				bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr);
-				goto restart_drop_extra_replicas;
-			}
-		}
-		rcu_read_unlock();
-
-		/* Finally, add the pointers we just wrote: */
-		extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
-			bch2_extent_ptr_decoded_append(insert, &p);
-
-		bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
-		bch2_extent_normalize(c, bkey_i_to_s(insert));
-
-		ret = bch2_sum_sector_overwrites(trans, &iter, insert,
-						 &should_check_enospc,
-						 &i_sectors_delta,
-						 &disk_sectors_delta);
-		if (ret)
-			goto err;
-
-		if (disk_sectors_delta > (s64) op->res.sectors) {
-			ret = bch2_disk_reservation_add(c, &op->res,
-						disk_sectors_delta - op->res.sectors,
-						!should_check_enospc
-						? BCH_DISK_RESERVATION_NOFAIL : 0);
-			if (ret)
-				goto out;
-		}
-
-		next_pos = insert->k.p;
-
-		/*
-		 * Check for nonce offset inconsistency:
-		 * This is debug code - we've been seeing this bug rarely, and
-		 * it's been hard to reproduce, so this should give us some more
-		 * information when it does occur:
-		 */
-		struct printbuf err = PRINTBUF;
-		int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err);
-		printbuf_exit(&err);
-
-		if (invalid) {
-			struct printbuf buf = PRINTBUF;
-
-			prt_str(&buf, "about to insert invalid key in data update path");
-			prt_str(&buf, "\nold: ");
-			bch2_bkey_val_to_text(&buf, c, old);
-			prt_str(&buf, "\nk:   ");
-			bch2_bkey_val_to_text(&buf, c, k);
-			prt_str(&buf, "\nnew: ");
-			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
-
-			bch2_print_string_as_lines(KERN_ERR, buf.buf);
-			printbuf_exit(&buf);
-
-			bch2_fatal_error(c);
-			goto out;
-		}
-
-		if (trace_data_update_enabled()) {
-			struct printbuf buf = PRINTBUF;
-
-			prt_str(&buf, "\nold: ");
-			bch2_bkey_val_to_text(&buf, c, old);
-			prt_str(&buf, "\nk:   ");
-			bch2_bkey_val_to_text(&buf, c, k);
-			prt_str(&buf, "\nnew: ");
-			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
-
-			trace_data_update(c, buf.buf);
-			printbuf_exit(&buf);
-		}
-
-		ret =   bch2_insert_snapshot_whiteouts(trans, m->btree_id,
-						k.k->p, bkey_start_pos(&insert->k)) ?:
-			bch2_insert_snapshot_whiteouts(trans, m->btree_id,
-						k.k->p, insert->k.p) ?:
-			bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?:
-			bch2_trans_update(trans, &iter, insert,
-				BTREE_UPDATE_internal_snapshot_node) ?:
-			bch2_trans_commit(trans, &op->res,
-				NULL,
-				BCH_TRANS_COMMIT_no_check_rw|
-				BCH_TRANS_COMMIT_no_enospc|
-				m->data_opts.btree_insert_flags);
-		if (!ret) {
-			bch2_btree_iter_set_pos(&iter, next_pos);
-
-			this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size);
-			trace_move_extent_finish2(c, bkey_i_to_s_c(&new->k_i));
-		}
-err:
-		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			ret = 0;
-		if (ret)
-			break;
-next:
-		while (bkey_ge(iter.pos, bch2_keylist_front(keys)->k.p)) {
-			bch2_keylist_pop_front(keys);
-			if (bch2_keylist_empty(keys))
-				goto out;
-		}
-		continue;
-nowork:
-		if (m->stats) {
-			BUG_ON(k.k->p.offset <= iter.pos.offset);
-			atomic64_inc(&m->stats->keys_raced);
-			atomic64_add(k.k->p.offset - iter.pos.offset,
-				     &m->stats->sectors_raced);
-		}
-
-		count_event(c, move_extent_fail);
-
-		bch2_btree_iter_advance(&iter);
-		goto next;
-	}
-out:
-	bch2_trans_iter_exit(trans, &iter);
-	bch2_bkey_buf_exit(&_insert, c);
-	bch2_bkey_buf_exit(&_new, c);
-	BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
-	return ret;
-}
-
-int bch2_data_update_index_update(struct bch_write_op *op)
-{
-	return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op));
-}
-
-void bch2_data_update_read_done(struct data_update *m,
-				struct bch_extent_crc_unpacked crc)
-{
-	/* write bio must own pages: */
-	BUG_ON(!m->op.wbio.bio.bi_vcnt);
-
-	m->op.crc = crc;
-	m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9;
-
-	closure_call(&m->op.cl, bch2_write, NULL, NULL);
-}
-
-void bch2_data_update_exit(struct data_update *update)
-{
-	struct bch_fs *c = update->op.c;
-	struct bkey_ptrs_c ptrs =
-		bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k));
-
-	bkey_for_each_ptr(ptrs, ptr) {
-		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
-		if (c->opts.nocow_enabled)
-			bch2_bucket_nocow_unlock(&c->nocow_locks,
-						 PTR_BUCKET_POS(ca, ptr), 0);
-		bch2_dev_put(ca);
-	}
-
-	bch2_bkey_buf_exit(&update->k, c);
-	bch2_disk_reservation_put(c, &update->op.res);
-	bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
-}
-
-static void bch2_update_unwritten_extent(struct btree_trans *trans,
-				  struct data_update *update)
-{
-	struct bch_fs *c = update->op.c;
-	struct bio *bio = &update->op.wbio.bio;
-	struct bkey_i_extent *e;
-	struct write_point *wp;
-	struct closure cl;
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	int ret;
-
-	closure_init_stack(&cl);
-	bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys);
-
-	while (bio_sectors(bio)) {
-		unsigned sectors = bio_sectors(bio);
-
-		bch2_trans_begin(trans);
-
-		bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos,
-				     BTREE_ITER_slots);
-		ret = lockrestart_do(trans, ({
-			k = bch2_btree_iter_peek_slot(&iter);
-			bkey_err(k);
-		}));
-		bch2_trans_iter_exit(trans, &iter);
-
-		if (ret || !bch2_extents_match(k, bkey_i_to_s_c(update->k.k)))
-			break;
-
-		e = bkey_extent_init(update->op.insert_keys.top);
-		e->k.p = update->op.pos;
-
-		ret = bch2_alloc_sectors_start_trans(trans,
-				update->op.target,
-				false,
-				update->op.write_point,
-				&update->op.devs_have,
-				update->op.nr_replicas,
-				update->op.nr_replicas,
-				update->op.watermark,
-				0, &cl, &wp);
-		if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) {
-			bch2_trans_unlock(trans);
-			closure_sync(&cl);
-			continue;
-		}
-
-		bch_err_fn_ratelimited(c, ret);
-
-		if (ret)
-			return;
-
-		sectors = min(sectors, wp->sectors_free);
-
-		bch2_key_resize(&e->k, sectors);
-
-		bch2_open_bucket_get(c, wp, &update->op.open_buckets);
-		bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
-		bch2_alloc_sectors_done(c, wp);
-
-		bio_advance(bio, sectors << 9);
-		update->op.pos.offset += sectors;
-
-		extent_for_each_ptr(extent_i_to_s(e), ptr)
-			ptr->unwritten = true;
-		bch2_keylist_push(&update->op.insert_keys);
-
-		ret = __bch2_data_update_index_update(trans, &update->op);
-
-		bch2_open_buckets_put(c, &update->op.open_buckets);
-
-		if (ret)
-			break;
-	}
-
-	if (closure_nr_remaining(&cl) != 1) {
-		bch2_trans_unlock(trans);
-		closure_sync(&cl);
-	}
-}
-
-int bch2_extent_drop_ptrs(struct btree_trans *trans,
-			  struct btree_iter *iter,
-			  struct bkey_s_c k,
-			  struct data_update_opts data_opts)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_i *n;
-	int ret;
-
-	n = bch2_bkey_make_mut_noupdate(trans, k);
-	ret = PTR_ERR_OR_ZERO(n);
-	if (ret)
-		return ret;
-
-	while (data_opts.kill_ptrs) {
-		unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
-
-		bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
-		data_opts.kill_ptrs ^= 1U << drop;
-	}
-
-	/*
-	 * If the new extent no longer has any pointers, bch2_extent_normalize()
-	 * will do the appropriate thing with it (turning it into a
-	 * KEY_TYPE_error key, or just a discard if it was a cached extent)
-	 */
-	bch2_extent_normalize(c, bkey_i_to_s(n));
-
-	/*
-	 * Since we're not inserting through an extent iterator
-	 * (BTREE_ITER_all_snapshots iterators aren't extent iterators),
-	 * we aren't using the extent overwrite path to delete, we're
-	 * just using the normal key deletion path:
-	 */
-	if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_is_extents))
-		n->k.size = 0;
-
-	return bch2_trans_relock(trans) ?:
-		bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
-		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-}
-
-int bch2_data_update_init(struct btree_trans *trans,
-			  struct btree_iter *iter,
-			  struct moving_context *ctxt,
-			  struct data_update *m,
-			  struct write_point_specifier wp,
-			  struct bch_io_opts io_opts,
-			  struct data_update_opts data_opts,
-			  enum btree_id btree_id,
-			  struct bkey_s_c k)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	const union bch_extent_entry *entry;
-	struct extent_ptr_decoded p;
-	unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
-	unsigned ptrs_locked = 0;
-	int ret = 0;
-
-	/*
-	 * fs is corrupt  we have a key for a snapshot node that doesn't exist,
-	 * and we have to check for this because we go rw before repairing the
-	 * snapshots table - just skip it, we can move it later.
-	 */
-	if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot)))
-		return -BCH_ERR_data_update_done;
-
-	bch2_bkey_buf_init(&m->k);
-	bch2_bkey_buf_reassemble(&m->k, c, k);
-	m->btree_id	= btree_id;
-	m->data_opts	= data_opts;
-	m->ctxt		= ctxt;
-	m->stats	= ctxt ? ctxt->stats : NULL;
-
-	bch2_write_op_init(&m->op, c, io_opts);
-	m->op.pos	= bkey_start_pos(k.k);
-	m->op.version	= k.k->version;
-	m->op.target	= data_opts.target;
-	m->op.write_point = wp;
-	m->op.nr_replicas = 0;
-	m->op.flags	|= BCH_WRITE_PAGES_STABLE|
-		BCH_WRITE_PAGES_OWNED|
-		BCH_WRITE_DATA_ENCODED|
-		BCH_WRITE_MOVE|
-		m->data_opts.write_flags;
-	m->op.compression_opt	= background_compression(io_opts);
-	m->op.watermark		= m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
-
-	bkey_for_each_ptr(ptrs, ptr) {
-		if (!bch2_dev_tryget(c, ptr->dev)) {
-			bkey_for_each_ptr(ptrs, ptr2) {
-				if (ptr2 == ptr)
-					break;
-				bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev));
-			}
-			return -BCH_ERR_data_update_done;
-		}
-	}
-
-	unsigned durability_have = 0, durability_removing = 0;
-
-	i = 0;
-	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-		struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev);
-		struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr);
-		bool locked;
-
-		rcu_read_lock();
-		if (((1U << i) & m->data_opts.rewrite_ptrs)) {
-			BUG_ON(p.ptr.cached);
-
-			if (crc_is_compressed(p.crc))
-				reserve_sectors += k.k->size;
-
-			m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
-			durability_removing += bch2_extent_ptr_desired_durability(c, &p);
-		} else if (!p.ptr.cached &&
-			   !((1U << i) & m->data_opts.kill_ptrs)) {
-			bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
-			durability_have += bch2_extent_ptr_durability(c, &p);
-		}
-		rcu_read_unlock();
-
-		/*
-		 * op->csum_type is normally initialized from the fs/file's
-		 * current options - but if an extent is encrypted, we require
-		 * that it stays encrypted:
-		 */
-		if (bch2_csum_type_is_encryption(p.crc.csum_type)) {
-			m->op.nonce	= p.crc.nonce + p.crc.offset;
-			m->op.csum_type = p.crc.csum_type;
-		}
-
-		if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
-			m->op.incompressible = true;
-
-		if (c->opts.nocow_enabled) {
-			if (ctxt) {
-				move_ctxt_wait_event(ctxt,
-						(locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
-									  bucket, 0)) ||
-						list_empty(&ctxt->ios));
-
-				if (!locked)
-					bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
-			} else {
-				if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) {
-					ret = -BCH_ERR_nocow_lock_blocked;
-					goto err;
-				}
-			}
-			ptrs_locked |= (1U << i);
-		}
-
-		i++;
-	}
-
-	unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have));
-
-	/*
-	 * If current extent durability is less than io_opts.data_replicas,
-	 * we're not trying to rereplicate the extent up to data_replicas here -
-	 * unless extra_replicas was specified
-	 *
-	 * Increasing replication is an explicit operation triggered by
-	 * rereplicate, currently, so that users don't get an unexpected -ENOSPC
-	 */
-	if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) &&
-	    !durability_required) {
-		m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
-		m->data_opts.rewrite_ptrs = 0;
-		/* if iter == NULL, it's just a promote */
-		if (iter)
-			ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts);
-		goto done;
-	}
-
-	m->op.nr_replicas = min(durability_removing, durability_required) +
-		m->data_opts.extra_replicas;
-
-	/*
-	 * If device(s) were set to durability=0 after data was written to them
-	 * we can end up with a duribilty=0 extent, and the normal algorithm
-	 * that tries not to increase durability doesn't work:
-	 */
-	if (!(durability_have + durability_removing))
-		m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1);
-
-	m->op.nr_replicas_required = m->op.nr_replicas;
-
-	if (reserve_sectors) {
-		ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
-				m->data_opts.extra_replicas
-				? 0
-				: BCH_DISK_RESERVATION_NOFAIL);
-		if (ret)
-			goto err;
-	}
-
-	if (bkey_extent_is_unwritten(k)) {
-		bch2_update_unwritten_extent(trans, m);
-		goto done;
-	}
-
-	return 0;
-err:
-	i = 0;
-	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-		struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev);
-		struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr);
-		if ((1U << i) & ptrs_locked)
-			bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
-		bch2_dev_put(ca);
-		i++;
-	}
-
-	bch2_bkey_buf_exit(&m->k, c);
-	bch2_bio_free_pages_pool(c, &m->op.wbio.bio);
-	return ret;
-done:
-	bch2_data_update_exit(m);
-	return ret ?: -BCH_ERR_data_update_done;
-}
-
-void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
-{
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	unsigned i = 0;
-
-	bkey_for_each_ptr(ptrs, ptr) {
-		if ((opts->rewrite_ptrs & (1U << i)) && ptr->cached) {
-			opts->kill_ptrs |= 1U << i;
-			opts->rewrite_ptrs ^= 1U << i;
-		}
-
-		i++;
-	}
-}
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
deleted file mode 100644
index 991095bbd469..000000000000
--- a/fs/bcachefs/data_update.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _BCACHEFS_DATA_UPDATE_H
-#define _BCACHEFS_DATA_UPDATE_H
-
-#include "bkey_buf.h"
-#include "io_write_types.h"
-
-struct moving_context;
-
-struct data_update_opts {
-	unsigned	rewrite_ptrs;
-	unsigned	kill_ptrs;
-	u16		target;
-	u8		extra_replicas;
-	unsigned	btree_insert_flags;
-	unsigned	write_flags;
-};
-
-struct data_update {
-	/* extent being updated: */
-	enum btree_id		btree_id;
-	struct bkey_buf		k;
-	struct data_update_opts	data_opts;
-	struct moving_context	*ctxt;
-	struct bch_move_stats	*stats;
-	struct bch_write_op	op;
-};
-
-int bch2_data_update_index_update(struct bch_write_op *);
-
-void bch2_data_update_read_done(struct data_update *,
-				struct bch_extent_crc_unpacked);
-
-int bch2_extent_drop_ptrs(struct btree_trans *,
-			  struct btree_iter *,
-			  struct bkey_s_c,
-			  struct data_update_opts);
-
-void bch2_data_update_exit(struct data_update *);
-int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
-			  struct moving_context *,
-			  struct data_update *,
-			  struct write_point_specifier,
-			  struct bch_io_opts, struct data_update_opts,
-			  enum btree_id, struct bkey_s_c);
-void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *);
-
-#endif /* _BCACHEFS_DATA_UPDATE_H */
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
deleted file mode 100644
index 51cbf3928361..000000000000
--- a/fs/bcachefs/debug.c
+++ /dev/null
@@ -1,942 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Assorted bcachefs debug code
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
-#include "debug.h"
-#include "error.h"
-#include "extents.h"
-#include "fsck.h"
-#include "inode.h"
-#include "super.h"
-
-#include <linux/console.h>
-#include <linux/debugfs.h>
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/seq_file.h>
-
-static struct dentry *bch_debug;
-
-static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
-				      struct extent_ptr_decoded pick)
-{
-	struct btree *v = c->verify_data;
-	struct btree_node *n_ondisk = c->verify_ondisk;
-	struct btree_node *n_sorted = c->verify_data->data;
-	struct bset *sorted, *inmemory = &b->data->keys;
-	struct bio *bio;
-	bool failed = false, saw_error = false;
-
-	struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
-	if (!ca)
-		return false;
-
-	bio = bio_alloc_bioset(ca->disk_sb.bdev,
-			       buf_pages(n_sorted, btree_buf_bytes(b)),
-			       REQ_OP_READ|REQ_META,
-			       GFP_NOFS,
-			       &c->btree_bio);
-	bio->bi_iter.bi_sector	= pick.ptr.offset;
-	bch2_bio_map(bio, n_sorted, btree_buf_bytes(b));
-
-	submit_bio_wait(bio);
-
-	bio_put(bio);
-	percpu_ref_put(&ca->io_ref);
-
-	memcpy(n_ondisk, n_sorted, btree_buf_bytes(b));
-
-	v->written = 0;
-	if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error)
-		return false;
-
-	n_sorted = c->verify_data->data;
-	sorted = &n_sorted->keys;
-
-	if (inmemory->u64s != sorted->u64s ||
-	    memcmp(inmemory->start,
-		   sorted->start,
-		   vstruct_end(inmemory) - (void *) inmemory->start)) {
-		unsigned offset = 0, sectors;
-		struct bset *i;
-		unsigned j;
-
-		console_lock();
-
-		printk(KERN_ERR "*** in memory:\n");
-		bch2_dump_bset(c, b, inmemory, 0);
-
-		printk(KERN_ERR "*** read back in:\n");
-		bch2_dump_bset(c, v, sorted, 0);
-
-		while (offset < v->written) {
-			if (!offset) {
-				i = &n_ondisk->keys;
-				sectors = vstruct_blocks(n_ondisk, c->block_bits) <<
-					c->block_bits;
-			} else {
-				struct btree_node_entry *bne =
-					(void *) n_ondisk + (offset << 9);
-				i = &bne->keys;
-
-				sectors = vstruct_blocks(bne, c->block_bits) <<
-					c->block_bits;
-			}
-
-			printk(KERN_ERR "*** on disk block %u:\n", offset);
-			bch2_dump_bset(c, b, i, offset);
-
-			offset += sectors;
-		}
-
-		for (j = 0; j < le16_to_cpu(inmemory->u64s); j++)
-			if (inmemory->_data[j] != sorted->_data[j])
-				break;
-
-		console_unlock();
-		bch_err(c, "verify failed at key %u", j);
-
-		failed = true;
-	}
-
-	if (v->written != b->written) {
-		bch_err(c, "written wrong: expected %u, got %u",
-			b->written, v->written);
-		failed = true;
-	}
-
-	return failed;
-}
-
-void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
-{
-	struct bkey_ptrs_c ptrs;
-	struct extent_ptr_decoded p;
-	const union bch_extent_entry *entry;
-	struct btree *v;
-	struct bset *inmemory = &b->data->keys;
-	struct bkey_packed *k;
-	bool failed = false;
-
-	if (c->opts.nochanges)
-		return;
-
-	bch2_btree_node_io_lock(b);
-	mutex_lock(&c->verify_lock);
-
-	if (!c->verify_ondisk) {
-		c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
-		if (!c->verify_ondisk)
-			goto out;
-	}
-
-	if (!c->verify_data) {
-		c->verify_data = __bch2_btree_node_mem_alloc(c);
-		if (!c->verify_data)
-			goto out;
-
-		list_del_init(&c->verify_data->list);
-	}
-
-	BUG_ON(b->nsets != 1);
-
-	for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k))
-		if (k->type == KEY_TYPE_btree_ptr_v2)
-			((struct bch_btree_ptr_v2 *) bkeyp_val(&b->format, k))->mem_ptr = 0;
-
-	v = c->verify_data;
-	bkey_copy(&v->key, &b->key);
-	v->c.level	= b->c.level;
-	v->c.btree_id	= b->c.btree_id;
-	bch2_btree_keys_init(v);
-
-	ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key));
-	bkey_for_each_ptr_decode(&b->key.k, ptrs, p, entry)
-		failed |= bch2_btree_verify_replica(c, b, p);
-
-	if (failed) {
-		struct printbuf buf = PRINTBUF;
-
-		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-		bch2_fs_fatal_error(c, ": btree node verify failed for: %s\n", buf.buf);
-		printbuf_exit(&buf);
-	}
-out:
-	mutex_unlock(&c->verify_lock);
-	bch2_btree_node_io_unlock(b);
-}
-
-void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
-				    const struct btree *b)
-{
-	struct btree_node *n_ondisk = NULL;
-	struct extent_ptr_decoded pick;
-	struct bch_dev *ca;
-	struct bio *bio = NULL;
-	unsigned offset = 0;
-	int ret;
-
-	if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) {
-		prt_printf(out, "error getting device to read from: invalid device\n");
-		return;
-	}
-
-	ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
-	if (!ca) {
-		prt_printf(out, "error getting device to read from: not online\n");
-		return;
-	}
-
-	n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
-	if (!n_ondisk) {
-		prt_printf(out, "memory allocation failure\n");
-		goto out;
-	}
-
-	bio = bio_alloc_bioset(ca->disk_sb.bdev,
-			       buf_pages(n_ondisk, btree_buf_bytes(b)),
-			       REQ_OP_READ|REQ_META,
-			       GFP_NOFS,
-			       &c->btree_bio);
-	bio->bi_iter.bi_sector	= pick.ptr.offset;
-	bch2_bio_map(bio, n_ondisk, btree_buf_bytes(b));
-
-	ret = submit_bio_wait(bio);
-	if (ret) {
-		prt_printf(out, "IO error reading btree node: %s\n", bch2_err_str(ret));
-		goto out;
-	}
-
-	while (offset < btree_sectors(c)) {
-		struct bset *i;
-		struct nonce nonce;
-		struct bch_csum csum;
-		struct bkey_packed *k;
-		unsigned sectors;
-
-		if (!offset) {
-			i = &n_ondisk->keys;
-
-			if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) {
-				prt_printf(out, "unknown checksum type at offset %u: %llu\n",
-					   offset, BSET_CSUM_TYPE(i));
-				goto out;
-			}
-
-			nonce = btree_nonce(i, offset << 9);
-			csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, n_ondisk);
-
-			if (bch2_crc_cmp(csum, n_ondisk->csum)) {
-				prt_printf(out, "invalid checksum\n");
-				goto out;
-			}
-
-			bset_encrypt(c, i, offset << 9);
-
-			sectors = vstruct_sectors(n_ondisk, c->block_bits);
-		} else {
-			struct btree_node_entry *bne = (void *) n_ondisk + (offset << 9);
-
-			i = &bne->keys;
-
-			if (i->seq != n_ondisk->keys.seq)
-				break;
-
-			if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) {
-				prt_printf(out, "unknown checksum type at offset %u: %llu\n",
-					   offset, BSET_CSUM_TYPE(i));
-				goto out;
-			}
-
-			nonce = btree_nonce(i, offset << 9);
-			csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
-
-			if (bch2_crc_cmp(csum, bne->csum)) {
-				prt_printf(out, "invalid checksum");
-				goto out;
-			}
-
-			bset_encrypt(c, i, offset << 9);
-
-			sectors = vstruct_sectors(bne, c->block_bits);
-		}
-
-		prt_printf(out, "  offset %u version %u, journal seq %llu\n",
-			   offset,
-			   le16_to_cpu(i->version),
-			   le64_to_cpu(i->journal_seq));
-		offset += sectors;
-
-		printbuf_indent_add(out, 4);
-
-		for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) {
-			struct bkey u;
-
-			bch2_bkey_val_to_text(out, c, bkey_disassemble(b, k, &u));
-			prt_newline(out);
-		}
-
-		printbuf_indent_sub(out, 4);
-	}
-out:
-	if (bio)
-		bio_put(bio);
-	kvfree(n_ondisk);
-	percpu_ref_put(&ca->io_ref);
-}
-
-#ifdef CONFIG_DEBUG_FS
-
-/* XXX: bch_fs refcounting */
-
-struct dump_iter {
-	struct bch_fs		*c;
-	enum btree_id		id;
-	struct bpos		from;
-	struct bpos		prev_node;
-	u64			iter;
-
-	struct printbuf		buf;
-
-	char __user		*ubuf;	/* destination user buffer */
-	size_t			size;	/* size of requested read */
-	ssize_t			ret;	/* bytes read so far */
-};
-
-static ssize_t flush_buf(struct dump_iter *i)
-{
-	if (i->buf.pos) {
-		size_t bytes = min_t(size_t, i->buf.pos, i->size);
-		int copied = bytes - copy_to_user(i->ubuf, i->buf.buf, bytes);
-
-		i->ret	 += copied;
-		i->ubuf	 += copied;
-		i->size	 -= copied;
-		i->buf.pos -= copied;
-		memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos);
-
-		if (copied != bytes)
-			return -EFAULT;
-	}
-
-	return i->size ? 0 : i->ret;
-}
-
-static int bch2_dump_open(struct inode *inode, struct file *file)
-{
-	struct btree_debug *bd = inode->i_private;
-	struct dump_iter *i;
-
-	i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
-	if (!i)
-		return -ENOMEM;
-
-	file->private_data = i;
-	i->from = POS_MIN;
-	i->iter	= 0;
-	i->c	= container_of(bd, struct bch_fs, btree_debug[bd->id]);
-	i->id	= bd->id;
-	i->buf	= PRINTBUF;
-
-	return 0;
-}
-
-static int bch2_dump_release(struct inode *inode, struct file *file)
-{
-	struct dump_iter *i = file->private_data;
-
-	printbuf_exit(&i->buf);
-	kfree(i);
-	return 0;
-}
-
-static ssize_t bch2_read_btree(struct file *file, char __user *buf,
-			       size_t size, loff_t *ppos)
-{
-	struct dump_iter *i = file->private_data;
-
-	i->ubuf = buf;
-	i->size	= size;
-	i->ret	= 0;
-
-	return flush_buf(i) ?:
-		bch2_trans_run(i->c,
-			for_each_btree_key(trans, iter, i->id, i->from,
-					   BTREE_ITER_prefetch|
-					   BTREE_ITER_all_snapshots, k, ({
-				bch2_bkey_val_to_text(&i->buf, i->c, k);
-				prt_newline(&i->buf);
-				bch2_trans_unlock(trans);
-				i->from = bpos_successor(iter.pos);
-				flush_buf(i);
-			}))) ?:
-		i->ret;
-}
-
-static const struct file_operations btree_debug_ops = {
-	.owner		= THIS_MODULE,
-	.open		= bch2_dump_open,
-	.release	= bch2_dump_release,
-	.read		= bch2_read_btree,
-};
-
-static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
-				       size_t size, loff_t *ppos)
-{
-	struct dump_iter *i = file->private_data;
-	struct btree_trans *trans;
-	struct btree_iter iter;
-	struct btree *b;
-	ssize_t ret;
-
-	i->ubuf = buf;
-	i->size	= size;
-	i->ret	= 0;
-
-	ret = flush_buf(i);
-	if (ret)
-		return ret;
-
-	if (bpos_eq(SPOS_MAX, i->from))
-		return i->ret;
-
-	trans = bch2_trans_get(i->c);
-retry:
-	bch2_trans_begin(trans);
-
-	for_each_btree_node(trans, iter, i->id, i->from, 0, b, ret) {
-		bch2_btree_node_to_text(&i->buf, i->c, b);
-		i->from = !bpos_eq(SPOS_MAX, b->key.k.p)
-			? bpos_successor(b->key.k.p)
-			: b->key.k.p;
-
-		ret = drop_locks_do(trans, flush_buf(i));
-		if (ret)
-			break;
-	}
-	bch2_trans_iter_exit(trans, &iter);
-
-	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-		goto retry;
-
-	bch2_trans_put(trans);
-
-	if (!ret)
-		ret = flush_buf(i);
-
-	return ret ?: i->ret;
-}
-
-static const struct file_operations btree_format_debug_ops = {
-	.owner		= THIS_MODULE,
-	.open		= bch2_dump_open,
-	.release	= bch2_dump_release,
-	.read		= bch2_read_btree_formats,
-};
-
-static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
-				       size_t size, loff_t *ppos)
-{
-	struct dump_iter *i = file->private_data;
-
-	i->ubuf = buf;
-	i->size	= size;
-	i->ret	= 0;
-
-	return flush_buf(i) ?:
-		bch2_trans_run(i->c,
-			for_each_btree_key(trans, iter, i->id, i->from,
-					   BTREE_ITER_prefetch|
-					   BTREE_ITER_all_snapshots, k, ({
-				struct btree_path_level *l =
-					&btree_iter_path(trans, &iter)->l[0];
-				struct bkey_packed *_k =
-					bch2_btree_node_iter_peek(&l->iter, l->b);
-
-				if (bpos_gt(l->b->key.k.p, i->prev_node)) {
-					bch2_btree_node_to_text(&i->buf, i->c, l->b);
-					i->prev_node = l->b->key.k.p;
-				}
-
-				bch2_bfloat_to_text(&i->buf, l->b, _k);
-				bch2_trans_unlock(trans);
-				i->from = bpos_successor(iter.pos);
-				flush_buf(i);
-			}))) ?:
-		i->ret;
-}
-
-static const struct file_operations bfloat_failed_debug_ops = {
-	.owner		= THIS_MODULE,
-	.open		= bch2_dump_open,
-	.release	= bch2_dump_release,
-	.read		= bch2_read_bfloat_failed,
-};
-
-static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
-					   struct btree *b)
-{
-	if (!out->nr_tabstops)
-		printbuf_tabstop_push(out, 32);
-
-	prt_printf(out, "%px btree=%s l=%u\n", b, bch2_btree_id_str(b->c.btree_id), b->c.level);
-
-	printbuf_indent_add(out, 2);
-
-	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
-	prt_newline(out);
-
-	prt_printf(out, "flags:\t");
-	prt_bitflags(out, bch2_btree_node_flags, b->flags);
-	prt_newline(out);
-
-	prt_printf(out, "pcpu read locks:\t%u\n",	b->c.lock.readers != NULL);
-	prt_printf(out, "written:\t%u\n",		b->written);
-	prt_printf(out, "writes blocked:\t%u\n",	!list_empty_careful(&b->write_blocked));
-	prt_printf(out, "will make reachable:\t%lx\n",	b->will_make_reachable);
-
-	prt_printf(out, "journal pin %px:\t%llu\n",
-		   &b->writes[0].journal, b->writes[0].journal.seq);
-	prt_printf(out, "journal pin %px:\t%llu\n",
-		   &b->writes[1].journal, b->writes[1].journal.seq);
-
-	printbuf_indent_sub(out, 2);
-}
-
-static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf,
-					    size_t size, loff_t *ppos)
-{
-	struct dump_iter *i = file->private_data;
-	struct bch_fs *c = i->c;
-	bool done = false;
-	ssize_t ret = 0;
-
-	i->ubuf = buf;
-	i->size	= size;
-	i->ret	= 0;
-
-	do {
-		struct bucket_table *tbl;
-		struct rhash_head *pos;
-		struct btree *b;
-
-		ret = flush_buf(i);
-		if (ret)
-			return ret;
-
-		rcu_read_lock();
-		i->buf.atomic++;
-		tbl = rht_dereference_rcu(c->btree_cache.table.tbl,
-					  &c->btree_cache.table);
-		if (i->iter < tbl->size) {
-			rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash)
-				bch2_cached_btree_node_to_text(&i->buf, c, b);
-			i->iter++;
-		} else {
-			done = true;
-		}
-		--i->buf.atomic;
-		rcu_read_unlock();
-	} while (!done);
-
-	if (i->buf.allocation_failure)
-		ret = -ENOMEM;
-
-	if (!ret)
-		ret = flush_buf(i);
-
-	return ret ?: i->ret;
-}
-
-static const struct file_operations cached_btree_nodes_ops = {
-	.owner		= THIS_MODULE,
-	.open		= bch2_dump_open,
-	.release	= bch2_dump_release,
-	.read		= bch2_cached_btree_nodes_read,
-};
-
-static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
-					    size_t size, loff_t *ppos)
-{
-	struct dump_iter *i = file->private_data;
-	struct bch_fs *c = i->c;
-	struct btree_trans *trans;
-	ssize_t ret = 0;
-	u32 seq;
-
-	i->ubuf = buf;
-	i->size	= size;
-	i->ret	= 0;
-restart:
-	seqmutex_lock(&c->btree_trans_lock);
-	list_for_each_entry(trans, &c->btree_trans_list, list) {
-		struct task_struct *task = READ_ONCE(trans->locking_wait.task);
-
-		if (!task || task->pid <= i->iter)
-			continue;
-
-		closure_get(&trans->ref);
-		seq = seqmutex_seq(&c->btree_trans_lock);
-		seqmutex_unlock(&c->btree_trans_lock);
-
-		ret = flush_buf(i);
-		if (ret) {
-			closure_put(&trans->ref);
-			goto unlocked;
-		}
-
-		bch2_btree_trans_to_text(&i->buf, trans);
-
-		prt_printf(&i->buf, "backtrace:\n");
-		printbuf_indent_add(&i->buf, 2);
-		bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL);
-		printbuf_indent_sub(&i->buf, 2);
-		prt_newline(&i->buf);
-
-		i->iter = task->pid;
-
-		closure_put(&trans->ref);
-
-		if (!seqmutex_relock(&c->btree_trans_lock, seq))
-			goto restart;
-	}
-	seqmutex_unlock(&c->btree_trans_lock);
-unlocked:
-	if (i->buf.allocation_failure)
-		ret = -ENOMEM;
-
-	if (!ret)
-		ret = flush_buf(i);
-
-	return ret ?: i->ret;
-}
-
-static const struct file_operations btree_transactions_ops = {
-	.owner		= THIS_MODULE,
-	.open		= bch2_dump_open,
-	.release	= bch2_dump_release,
-	.read		= bch2_btree_transactions_read,
-};
-
-static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
-				      size_t size, loff_t *ppos)
-{
-	struct dump_iter *i = file->private_data;
-	struct bch_fs *c = i->c;
-	bool done = false;
-	int err;
-
-	i->ubuf = buf;
-	i->size	= size;
-	i->ret	= 0;
-
-	while (1) {
-		err = flush_buf(i);
-		if (err)
-			return err;
-
-		if (!i->size)
-			break;
-
-		if (done)
-			break;
-
-		done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter);
-		i->iter++;
-	}
-
-	if (i->buf.allocation_failure)
-		return -ENOMEM;
-
-	return i->ret;
-}
-
-static const struct file_operations journal_pins_ops = {
-	.owner		= THIS_MODULE,
-	.open		= bch2_dump_open,
-	.release	= bch2_dump_release,
-	.read		= bch2_journal_pins_read,
-};
-
-static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf,
-				       size_t size, loff_t *ppos)
-{
-	struct dump_iter *i = file->private_data;
-	struct bch_fs *c = i->c;
-	int err;
-
-	i->ubuf = buf;
-	i->size	= size;
-	i->ret	= 0;
-
-	if (!i->iter) {
-		bch2_btree_updates_to_text(&i->buf, c);
-		i->iter++;
-	}
-
-	err = flush_buf(i);
-	if (err)
-		return err;
-
-	if (i->buf.allocation_failure)
-		return -ENOMEM;
-
-	return i->ret;
-}
-
-static const struct file_operations btree_updates_ops = {
-	.owner		= THIS_MODULE,
-	.open		= bch2_dump_open,
-	.release	= bch2_dump_release,
-	.read		= bch2_btree_updates_read,
-};
-
-static int btree_transaction_stats_open(struct inode *inode, struct file *file)
-{
-	struct bch_fs *c = inode->i_private;
-	struct dump_iter *i;
-
-	i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
-	if (!i)
-		return -ENOMEM;
-
-	i->iter = 1;
-	i->c    = c;
-	i->buf  = PRINTBUF;
-	file->private_data = i;
-
-	return 0;
-}
-
-static int btree_transaction_stats_release(struct inode *inode, struct file *file)
-{
-	struct dump_iter *i = file->private_data;
-
-	printbuf_exit(&i->buf);
-	kfree(i);
-
-	return 0;
-}
-
-static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf,
-					    size_t size, loff_t *ppos)
-{
-	struct dump_iter        *i = file->private_data;
-	struct bch_fs *c = i->c;
-	int err;
-
-	i->ubuf = buf;
-	i->size = size;
-	i->ret  = 0;
-
-	while (1) {
-		struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter];
-
-		err = flush_buf(i);
-		if (err)
-			return err;
-
-		if (!i->size)
-			break;
-
-		if (i->iter == ARRAY_SIZE(bch2_btree_transaction_fns) ||
-		    !bch2_btree_transaction_fns[i->iter])
-			break;
-
-		prt_printf(&i->buf, "%s:\n", bch2_btree_transaction_fns[i->iter]);
-		printbuf_indent_add(&i->buf, 2);
-
-		mutex_lock(&s->lock);
-
-		prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem);
-		prt_printf(&i->buf, "Transaction duration:\n");
-
-		printbuf_indent_add(&i->buf, 2);
-		bch2_time_stats_to_text(&i->buf, &s->duration);
-		printbuf_indent_sub(&i->buf, 2);
-
-		if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) {
-			prt_printf(&i->buf, "Lock hold times:\n");
-
-			printbuf_indent_add(&i->buf, 2);
-			bch2_time_stats_to_text(&i->buf, &s->lock_hold_times);
-			printbuf_indent_sub(&i->buf, 2);
-		}
-
-		if (s->max_paths_text) {
-			prt_printf(&i->buf, "Maximum allocated btree paths (%u):\n", s->nr_max_paths);
-
-			printbuf_indent_add(&i->buf, 2);
-			prt_str_indented(&i->buf, s->max_paths_text);
-			printbuf_indent_sub(&i->buf, 2);
-		}
-
-		mutex_unlock(&s->lock);
-
-		printbuf_indent_sub(&i->buf, 2);
-		prt_newline(&i->buf);
-		i->iter++;
-	}
-
-	if (i->buf.allocation_failure)
-		return -ENOMEM;
-
-	return i->ret;
-}
-
-static const struct file_operations btree_transaction_stats_op = {
-	.owner		= THIS_MODULE,
-	.open		= btree_transaction_stats_open,
-	.release	= btree_transaction_stats_release,
-	.read		= btree_transaction_stats_read,
-};
-
-static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
-					    size_t size, loff_t *ppos)
-{
-	struct dump_iter *i = file->private_data;
-	struct bch_fs *c = i->c;
-	struct btree_trans *trans;
-	ssize_t ret = 0;
-	u32 seq;
-
-	i->ubuf = buf;
-	i->size	= size;
-	i->ret	= 0;
-
-	if (i->iter)
-		goto out;
-restart:
-	seqmutex_lock(&c->btree_trans_lock);
-	list_for_each_entry(trans, &c->btree_trans_list, list) {
-		struct task_struct *task = READ_ONCE(trans->locking_wait.task);
-
-		if (!task || task->pid <= i->iter)
-			continue;
-
-		closure_get(&trans->ref);
-		seq = seqmutex_seq(&c->btree_trans_lock);
-		seqmutex_unlock(&c->btree_trans_lock);
-
-		ret = flush_buf(i);
-		if (ret) {
-			closure_put(&trans->ref);
-			goto out;
-		}
-
-		bch2_check_for_deadlock(trans, &i->buf);
-
-		i->iter = task->pid;
-
-		closure_put(&trans->ref);
-
-		if (!seqmutex_relock(&c->btree_trans_lock, seq))
-			goto restart;
-	}
-	seqmutex_unlock(&c->btree_trans_lock);
-out:
-	if (i->buf.allocation_failure)
-		ret = -ENOMEM;
-
-	if (!ret)
-		ret = flush_buf(i);
-
-	return ret ?: i->ret;
-}
-
-static const struct file_operations btree_deadlock_ops = {
-	.owner		= THIS_MODULE,
-	.open		= bch2_dump_open,
-	.release	= bch2_dump_release,
-	.read		= bch2_btree_deadlock_read,
-};
-
-void bch2_fs_debug_exit(struct bch_fs *c)
-{
-	if (!IS_ERR_OR_NULL(c->fs_debug_dir))
-		debugfs_remove_recursive(c->fs_debug_dir);
-}
-
-static void bch2_fs_debug_btree_init(struct bch_fs *c, struct btree_debug *bd)
-{
-	struct dentry *d;
-
-	d = debugfs_create_dir(bch2_btree_id_str(bd->id), c->btree_debug_dir);
-
-	debugfs_create_file("keys", 0400, d, bd, &btree_debug_ops);
-
-	debugfs_create_file("formats", 0400, d, bd, &btree_format_debug_ops);
-
-	debugfs_create_file("bfloat-failed", 0400, d, bd,
-			    &bfloat_failed_debug_ops);
-}
-
-void bch2_fs_debug_init(struct bch_fs *c)
-{
-	struct btree_debug *bd;
-	char name[100];
-
-	if (IS_ERR_OR_NULL(bch_debug))
-		return;
-
-	snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
-	c->fs_debug_dir = debugfs_create_dir(name, bch_debug);
-	if (IS_ERR_OR_NULL(c->fs_debug_dir))
-		return;
-
-	debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir,
-			    c->btree_debug, &cached_btree_nodes_ops);
-
-	debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir,
-			    c->btree_debug, &btree_transactions_ops);
-
-	debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
-			    c->btree_debug, &journal_pins_ops);
-
-	debugfs_create_file("btree_updates", 0400, c->fs_debug_dir,
-			    c->btree_debug, &btree_updates_ops);
-
-	debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir,
-			    c, &btree_transaction_stats_op);
-
-	debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir,
-			    c->btree_debug, &btree_deadlock_ops);
-
-	c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir);
-	if (IS_ERR_OR_NULL(c->btree_debug_dir))
-		return;
-
-	for (bd = c->btree_debug;
-	     bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
-	     bd++) {
-		bd->id = bd - c->btree_debug;
-		bch2_fs_debug_btree_init(c, bd);
-	}
-}
-
-#endif
-
-void bch2_debug_exit(void)
-{
-	if (!IS_ERR_OR_NULL(bch_debug))
-		debugfs_remove_recursive(bch_debug);
-}
-
-int __init bch2_debug_init(void)
-{
-	bch_debug = debugfs_create_dir("bcachefs", NULL);
-	return 0;
-}
diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h
deleted file mode 100644
index 2c37143b5fd1..000000000000
--- a/fs/bcachefs/debug.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DEBUG_H
-#define _BCACHEFS_DEBUG_H
-
-#include "bcachefs.h"
-
-struct bio;
-struct btree;
-struct bch_fs;
-
-void __bch2_btree_verify(struct bch_fs *, struct btree *);
-void bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *,
-				    const struct btree *);
-
-static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b)
-{
-	if (bch2_verify_btree_ondisk)
-		__bch2_btree_verify(c, b);
-}
-
-#ifdef CONFIG_DEBUG_FS
-void bch2_fs_debug_exit(struct bch_fs *);
-void bch2_fs_debug_init(struct bch_fs *);
-#else
-static inline void bch2_fs_debug_exit(struct bch_fs *c) {}
-static inline void bch2_fs_debug_init(struct bch_fs *c) {}
-#endif
-
-void bch2_debug_exit(void);
-int bch2_debug_init(void);
-
-#endif /* _BCACHEFS_DEBUG_H */
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
deleted file mode 100644
index 6bbf9a7d9e4d..000000000000
--- a/fs/bcachefs/dirent.c
+++ /dev/null
@@ -1,603 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "bkey_methods.h"
-#include "btree_update.h"
-#include "extents.h"
-#include "dirent.h"
-#include "fs.h"
-#include "keylist.h"
-#include "str_hash.h"
-#include "subvolume.h"
-
-#include <linux/dcache.h>
-
-static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
-{
-	unsigned bkey_u64s = bkey_val_u64s(d.k);
-	unsigned bkey_bytes = bkey_u64s * sizeof(u64);
-	u64 last_u64 = ((u64*)d.v)[bkey_u64s - 1];
-#if CPU_BIG_ENDIAN
-	unsigned trailing_nuls = last_u64 ? __builtin_ctzll(last_u64) / 8 : 64 / 8;
-#else
-	unsigned trailing_nuls = last_u64 ? __builtin_clzll(last_u64) / 8 : 64 / 8;
-#endif
-
-	return bkey_bytes -
-		offsetof(struct bch_dirent, d_name) -
-		trailing_nuls;
-}
-
-struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d)
-{
-	return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
-}
-
-static u64 bch2_dirent_hash(const struct bch_hash_info *info,
-			    const struct qstr *name)
-{
-	struct bch_str_hash_ctx ctx;
-
-	bch2_str_hash_init(&ctx, info);
-	bch2_str_hash_update(&ctx, info, name->name, name->len);
-
-	/* [0,2) reserved for dots */
-	return max_t(u64, bch2_str_hash_end(&ctx, info), 2);
-}
-
-static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
-{
-	return bch2_dirent_hash(info, key);
-}
-
-static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
-{
-	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-	struct qstr name = bch2_dirent_get_name(d);
-
-	return bch2_dirent_hash(info, &name);
-}
-
-static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
-{
-	struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
-	const struct qstr l_name = bch2_dirent_get_name(l);
-	const struct qstr *r_name = _r;
-
-	return !qstr_eq(l_name, *r_name);
-}
-
-static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
-{
-	struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
-	struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r);
-	const struct qstr l_name = bch2_dirent_get_name(l);
-	const struct qstr r_name = bch2_dirent_get_name(r);
-
-	return !qstr_eq(l_name, r_name);
-}
-
-static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k)
-{
-	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-
-	if (d.v->d_type == DT_SUBVOL)
-		return le32_to_cpu(d.v->d_parent_subvol) == inum.subvol;
-	return true;
-}
-
-const struct bch_hash_desc bch2_dirent_hash_desc = {
-	.btree_id	= BTREE_ID_dirents,
-	.key_type	= KEY_TYPE_dirent,
-	.hash_key	= dirent_hash_key,
-	.hash_bkey	= dirent_hash_bkey,
-	.cmp_key	= dirent_cmp_key,
-	.cmp_bkey	= dirent_cmp_bkey,
-	.is_visible	= dirent_is_visible,
-};
-
-int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k,
-			enum bch_validate_flags flags,
-			struct printbuf *err)
-{
-	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-	struct qstr d_name = bch2_dirent_get_name(d);
-	int ret = 0;
-
-	bkey_fsck_err_on(!d_name.len, c, err,
-			 dirent_empty_name,
-			 "empty name");
-
-	bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), c, err,
-			 dirent_val_too_big,
-			 "value too big (%zu > %u)",
-			 bkey_val_u64s(k.k), dirent_val_u64s(d_name.len));
-
-	/*
-	 * Check new keys don't exceed the max length
-	 * (older keys may be larger.)
-	 */
-	bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, c, err,
-			 dirent_name_too_long,
-			 "dirent name too big (%u > %u)",
-			 d_name.len, BCH_NAME_MAX);
-
-	bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), c, err,
-			 dirent_name_embedded_nul,
-			 "dirent has stray data after name's NUL");
-
-	bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) ||
-			 (d_name.len == 2 && !memcmp(d_name.name, "..", 2)), c, err,
-			 dirent_name_dot_or_dotdot,
-			 "invalid name");
-
-	bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), c, err,
-			 dirent_name_has_slash,
-			 "name with /");
-
-	bkey_fsck_err_on(d.v->d_type != DT_SUBVOL &&
-			 le64_to_cpu(d.v->d_inum) == d.k->p.inode, c, err,
-			 dirent_to_itself,
-			 "dirent points to own directory");
-fsck_err:
-	return ret;
-}
-
-void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
-	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-	struct qstr d_name = bch2_dirent_get_name(d);
-
-	prt_printf(out, "%.*s -> ", d_name.len, d_name.name);
-
-	if (d.v->d_type != DT_SUBVOL)
-		prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum));
-	else
-		prt_printf(out, "%u -> %u",
-			   le32_to_cpu(d.v->d_parent_subvol),
-			   le32_to_cpu(d.v->d_child_subvol));
-
-	prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type));
-}
-
-static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
-				subvol_inum dir, u8 type,
-				const struct qstr *name, u64 dst)
-{
-	struct bkey_i_dirent *dirent;
-	unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);
-
-	if (name->len > BCH_NAME_MAX)
-		return ERR_PTR(-ENAMETOOLONG);
-
-	BUG_ON(u64s > U8_MAX);
-
-	dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
-	if (IS_ERR(dirent))
-		return dirent;
-
-	bkey_dirent_init(&dirent->k_i);
-	dirent->k.u64s = u64s;
-
-	if (type != DT_SUBVOL) {
-		dirent->v.d_inum = cpu_to_le64(dst);
-	} else {
-		dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol);
-		dirent->v.d_child_subvol = cpu_to_le32(dst);
-	}
-
-	dirent->v.d_type = type;
-
-	memcpy(dirent->v.d_name, name->name, name->len);
-	memset(dirent->v.d_name + name->len, 0,
-	       bkey_val_bytes(&dirent->k) -
-	       offsetof(struct bch_dirent, d_name) -
-	       name->len);
-
-	EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len);
-
-	return dirent;
-}
-
-int bch2_dirent_create_snapshot(struct btree_trans *trans,
-			u32 dir_subvol, u64 dir, u32 snapshot,
-			const struct bch_hash_info *hash_info,
-			u8 type, const struct qstr *name, u64 dst_inum,
-			u64 *dir_offset,
-			enum btree_iter_update_trigger_flags flags)
-{
-	subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir };
-	struct bkey_i_dirent *dirent;
-	int ret;
-
-	dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum);
-	ret = PTR_ERR_OR_ZERO(dirent);
-	if (ret)
-		return ret;
-
-	dirent->k.p.inode	= dir;
-	dirent->k.p.snapshot	= snapshot;
-
-	ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
-					dir_inum, snapshot, &dirent->k_i,
-					flags|BTREE_UPDATE_internal_snapshot_node);
-	*dir_offset = dirent->k.p.offset;
-
-	return ret;
-}
-
-int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
-		       const struct bch_hash_info *hash_info,
-		       u8 type, const struct qstr *name, u64 dst_inum,
-		       u64 *dir_offset,
-		       enum btree_iter_update_trigger_flags flags)
-{
-	struct bkey_i_dirent *dirent;
-	int ret;
-
-	dirent = dirent_create_key(trans, dir, type, name, dst_inum);
-	ret = PTR_ERR_OR_ZERO(dirent);
-	if (ret)
-		return ret;
-
-	ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
-			    dir, &dirent->k_i, flags);
-	*dir_offset = dirent->k.p.offset;
-
-	return ret;
-}
-
-static void dirent_copy_target(struct bkey_i_dirent *dst,
-			       struct bkey_s_c_dirent src)
-{
-	dst->v.d_inum = src.v->d_inum;
-	dst->v.d_type = src.v->d_type;
-}
-
-int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
-			    struct bkey_s_c_dirent d, subvol_inum *target)
-{
-	struct bch_subvolume s;
-	int ret = 0;
-
-	if (d.v->d_type == DT_SUBVOL &&
-	    le32_to_cpu(d.v->d_parent_subvol) != dir.subvol)
-		return 1;
-
-	if (likely(d.v->d_type != DT_SUBVOL)) {
-		target->subvol	= dir.subvol;
-		target->inum	= le64_to_cpu(d.v->d_inum);
-	} else {
-		target->subvol	= le32_to_cpu(d.v->d_child_subvol);
-
-		ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_cached, &s);
-
-		target->inum	= le64_to_cpu(s.inode);
-	}
-
-	return ret;
-}
-
-int bch2_dirent_rename(struct btree_trans *trans,
-		subvol_inum src_dir, struct bch_hash_info *src_hash,
-		subvol_inum dst_dir, struct bch_hash_info *dst_hash,
-		const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset,
-		const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset,
-		enum bch_rename_mode mode)
-{
-	struct btree_iter src_iter = { NULL };
-	struct btree_iter dst_iter = { NULL };
-	struct bkey_s_c old_src, old_dst = bkey_s_c_null;
-	struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
-	struct bpos dst_pos =
-		POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name));
-	unsigned src_update_flags = 0;
-	bool delete_src, delete_dst;
-	int ret = 0;
-
-	memset(src_inum, 0, sizeof(*src_inum));
-	memset(dst_inum, 0, sizeof(*dst_inum));
-
-	/* Lookup src: */
-	old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
-				   src_hash, src_dir, src_name,
-				   BTREE_ITER_intent);
-	ret = bkey_err(old_src);
-	if (ret)
-		goto out;
-
-	ret = bch2_dirent_read_target(trans, src_dir,
-			bkey_s_c_to_dirent(old_src), src_inum);
-	if (ret)
-		goto out;
-
-	/* Lookup dst: */
-	if (mode == BCH_RENAME) {
-		/*
-		 * Note that we're _not_ checking if the target already exists -
-		 * we're relying on the VFS to do that check for us for
-		 * correctness:
-		 */
-		ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc,
-				     dst_hash, dst_dir, dst_name);
-		if (ret)
-			goto out;
-	} else {
-		old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
-					    dst_hash, dst_dir, dst_name,
-					    BTREE_ITER_intent);
-		ret = bkey_err(old_dst);
-		if (ret)
-			goto out;
-
-		ret = bch2_dirent_read_target(trans, dst_dir,
-				bkey_s_c_to_dirent(old_dst), dst_inum);
-		if (ret)
-			goto out;
-	}
-
-	if (mode != BCH_RENAME_EXCHANGE)
-		*src_offset = dst_iter.pos.offset;
-
-	/* Create new dst key: */
-	new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0);
-	ret = PTR_ERR_OR_ZERO(new_dst);
-	if (ret)
-		goto out;
-
-	dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
-	new_dst->k.p = dst_iter.pos;
-
-	/* Create new src key: */
-	if (mode == BCH_RENAME_EXCHANGE) {
-		new_src = dirent_create_key(trans, src_dir, 0, src_name, 0);
-		ret = PTR_ERR_OR_ZERO(new_src);
-		if (ret)
-			goto out;
-
-		dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
-		new_src->k.p = src_iter.pos;
-	} else {
-		new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
-		ret = PTR_ERR_OR_ZERO(new_src);
-		if (ret)
-			goto out;
-
-		bkey_init(&new_src->k);
-		new_src->k.p = src_iter.pos;
-
-		if (bkey_le(dst_pos, src_iter.pos) &&
-		    bkey_lt(src_iter.pos, dst_iter.pos)) {
-			/*
-			 * We have a hash collision for the new dst key,
-			 * and new_src - the key we're deleting - is between
-			 * new_dst's hashed slot and the slot we're going to be
-			 * inserting it into - oops.  This will break the hash
-			 * table if we don't deal with it:
-			 */
-			if (mode == BCH_RENAME) {
-				/*
-				 * If we're not overwriting, we can just insert
-				 * new_dst at the src position:
-				 */
-				new_src = new_dst;
-				new_src->k.p = src_iter.pos;
-				goto out_set_src;
-			} else {
-				/* If we're overwriting, we can't insert new_dst
-				 * at a different slot because it has to
-				 * overwrite old_dst - just make sure to use a
-				 * whiteout when deleting src:
-				 */
-				new_src->k.type = KEY_TYPE_hash_whiteout;
-			}
-		} else {
-			/* Check if we need a whiteout to delete src: */
-			ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc,
-						       src_hash, &src_iter);
-			if (ret < 0)
-				goto out;
-
-			if (ret)
-				new_src->k.type = KEY_TYPE_hash_whiteout;
-		}
-	}
-
-	if (new_dst->v.d_type == DT_SUBVOL)
-		new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol);
-
-	if ((mode == BCH_RENAME_EXCHANGE) &&
-	    new_src->v.d_type == DT_SUBVOL)
-		new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol);
-
-	ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
-	if (ret)
-		goto out;
-out_set_src:
-	/*
-	 * If we're deleting a subvolume we need to really delete the dirent,
-	 * not just emit a whiteout in the current snapshot - there can only be
-	 * single dirent that points to a given subvolume.
-	 *
-	 * IOW, we don't maintain multiple versions in different snapshots of
-	 * dirents that point to subvolumes - dirents that point to subvolumes
-	 * are only visible in one particular subvolume so it's not necessary,
-	 * and it would be particularly confusing for fsck to have to deal with.
-	 */
-	delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL &&
-		new_src->k.p.snapshot != old_src.k->p.snapshot;
-
-	delete_dst = old_dst.k &&
-		bkey_s_c_to_dirent(old_dst).v->d_type == DT_SUBVOL &&
-		new_dst->k.p.snapshot != old_dst.k->p.snapshot;
-
-	if (!delete_src || !bkey_deleted(&new_src->k)) {
-		ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
-		if (ret)
-			goto out;
-	}
-
-	if (delete_src) {
-		bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
-		ret =   bch2_btree_iter_traverse(&src_iter) ?:
-			bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node);
-		if (ret)
-			goto out;
-	}
-
-	if (delete_dst) {
-		bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot);
-		ret =   bch2_btree_iter_traverse(&dst_iter) ?:
-			bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_internal_snapshot_node);
-		if (ret)
-			goto out;
-	}
-
-	if (mode == BCH_RENAME_EXCHANGE)
-		*src_offset = new_src->k.p.offset;
-	*dst_offset = new_dst->k.p.offset;
-out:
-	bch2_trans_iter_exit(trans, &src_iter);
-	bch2_trans_iter_exit(trans, &dst_iter);
-	return ret;
-}
-
-int bch2_dirent_lookup_trans(struct btree_trans *trans,
-			     struct btree_iter *iter,
-			     subvol_inum dir,
-			     const struct bch_hash_info *hash_info,
-			     const struct qstr *name, subvol_inum *inum,
-			     unsigned flags)
-{
-	struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
-					     hash_info, dir, name, flags);
-	int ret = bkey_err(k);
-	if (ret)
-		goto err;
-
-	ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum);
-	if (ret > 0)
-		ret = -ENOENT;
-err:
-	if (ret)
-		bch2_trans_iter_exit(trans, iter);
-	return ret;
-}
-
-u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
-		       const struct bch_hash_info *hash_info,
-		       const struct qstr *name, subvol_inum *inum)
-{
-	struct btree_trans *trans = bch2_trans_get(c);
-	struct btree_iter iter = { NULL };
-
-	int ret = lockrestart_do(trans,
-		bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
-	bch2_trans_iter_exit(trans, &iter);
-	bch2_trans_put(trans);
-	return ret;
-}
-
-int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 snapshot)
-{
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	int ret;
-
-	for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
-			   SPOS(dir, 0, snapshot),
-			   POS(dir, U64_MAX), 0, k, ret)
-		if (k.k->type == KEY_TYPE_dirent) {
-			struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-			if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol)
-				continue;
-			ret = -BCH_ERR_ENOTEMPTY_dir_not_empty;
-			break;
-		}
-	bch2_trans_iter_exit(trans, &iter);
-
-	return ret;
-}
-
-int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
-{
-	u32 snapshot;
-
-	return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?:
-		bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot);
-}
-
-static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subvol_inum target)
-{
-	struct qstr name = bch2_dirent_get_name(d);
-	bool ret = dir_emit(ctx, name.name,
-		      name.len,
-		      target.inum,
-		      vfs_d_type(d.v->d_type));
-	if (ret)
-		ctx->pos = d.k->p.offset + 1;
-	return ret;
-}
-
-int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
-{
-	struct btree_trans *trans = bch2_trans_get(c);
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	subvol_inum target;
-	u32 snapshot;
-	struct bkey_buf sk;
-	int ret;
-
-	bch2_bkey_buf_init(&sk);
-retry:
-	bch2_trans_begin(trans);
-
-	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-	if (ret)
-		goto err;
-
-	for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
-			   SPOS(inum.inum, ctx->pos, snapshot),
-			   POS(inum.inum, U64_MAX), 0, k, ret) {
-		if (k.k->type != KEY_TYPE_dirent)
-			continue;
-
-		/* dir_emit() can fault and block: */
-		bch2_bkey_buf_reassemble(&sk, c, k);
-		struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k);
-
-		ret = bch2_dirent_read_target(trans, inum, dirent, &target);
-		if (ret < 0)
-			break;
-		if (ret)
-			continue;
-
-		/*
-		 * read_target looks up subvolumes, we can overflow paths if the
-		 * directory has many subvolumes in it
-		 *
-		 * XXX: btree_trans_too_many_iters() is something we'd like to
-		 * get rid of, and there's no good reason to be using it here
-		 * except that we don't yet have a for_each_btree_key() helper
-		 * that does subvolume_get_snapshot().
-		 */
-		ret =   drop_locks_do(trans,
-				bch2_dir_emit(ctx, dirent, target)) ?:
-			btree_trans_too_many_iters(trans);
-		if (ret) {
-			ret = ret < 0 ? ret : 0;
-			break;
-		}
-	}
-	bch2_trans_iter_exit(trans, &iter);
-err:
-	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-		goto retry;
-
-	bch2_trans_put(trans);
-	bch2_bkey_buf_exit(&sk, c);
-
-	return ret;
-}
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
deleted file mode 100644
index 24037e6e0a09..000000000000
--- a/fs/bcachefs/dirent.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DIRENT_H
-#define _BCACHEFS_DIRENT_H
-
-#include "str_hash.h"
-
-enum bch_validate_flags;
-extern const struct bch_hash_desc bch2_dirent_hash_desc;
-
-int bch2_dirent_invalid(struct bch_fs *, struct bkey_s_c,
-			enum bch_validate_flags, struct printbuf *);
-void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_dirent ((struct bkey_ops) {	\
-	.key_invalid	= bch2_dirent_invalid,		\
-	.val_to_text	= bch2_dirent_to_text,		\
-	.min_val_size	= 16,				\
-})
-
-struct qstr;
-struct file;
-struct dir_context;
-struct bch_fs;
-struct bch_hash_info;
-struct bch_inode_info;
-
-struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d);
-
-static inline unsigned dirent_val_u64s(unsigned len)
-{
-	return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len,
-			    sizeof(u64));
-}
-
-int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
-			    struct bkey_s_c_dirent, subvol_inum *);
-
-int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
-			const struct bch_hash_info *, u8,
-			const struct qstr *, u64, u64 *,
-			enum btree_iter_update_trigger_flags);
-int bch2_dirent_create(struct btree_trans *, subvol_inum,
-		       const struct bch_hash_info *, u8,
-		       const struct qstr *, u64, u64 *,
-		       enum btree_iter_update_trigger_flags);
-
-static inline unsigned vfs_d_type(unsigned type)
-{
-	return type == DT_SUBVOL ? DT_DIR : type;
-}
-
-enum bch_rename_mode {
-	BCH_RENAME,
-	BCH_RENAME_OVERWRITE,
-	BCH_RENAME_EXCHANGE,
-};
-
-int bch2_dirent_rename(struct btree_trans *,
-		       subvol_inum, struct bch_hash_info *,
-		       subvol_inum, struct bch_hash_info *,
-		       const struct qstr *, subvol_inum *, u64 *,
-		       const struct qstr *, subvol_inum *, u64 *,
-		       enum bch_rename_mode);
-
-int bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
-			       subvol_inum, const struct bch_hash_info *,
-			       const struct qstr *, subvol_inum *, unsigned);
-u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
-		       const struct bch_hash_info *,
-		       const struct qstr *, subvol_inum *);
-
-int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32);
-int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
-int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
-
-#endif /* _BCACHEFS_DIRENT_H */
diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h
deleted file mode 100644
index 5e116b88e814..000000000000
--- a/fs/bcachefs/dirent_format.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DIRENT_FORMAT_H
-#define _BCACHEFS_DIRENT_FORMAT_H
-
-/*
- * Dirents (and xattrs) have to implement string lookups; since our b-tree
- * doesn't support arbitrary length strings for the key, we instead index by a
- * 64 bit hash (currently truncated sha1) of the string, stored in the offset
- * field of the key - using linear probing to resolve hash collisions. This also
- * provides us with the readdir cookie posix requires.
- *
- * Linear probing requires us to use whiteouts for deletions, in the event of a
- * collision:
- */
-
-struct bch_dirent {
-	struct bch_val		v;
-
-	/* Target inode number: */
-	union {
-	__le64			d_inum;
-	struct {		/* DT_SUBVOL */
-	__le32			d_child_subvol;
-	__le32			d_parent_subvol;
-	};
-	};
-
-	/*
-	 * Copy of mode bits 12-15 from the target inode - so userspace can get
-	 * the filetype without having to do a stat()
-	 */
-	__u8			d_type;
-
-	__u8			d_name[];
-} __packed __aligned(8);
-
-#define DT_SUBVOL	16
-#define BCH_DT_MAX	17
-
-#define BCH_NAME_MAX	512
-
-#endif /* _BCACHEFS_DIRENT_FORMAT_H */
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
deleted file mode 100644
index 521a86df5e52..000000000000
--- a/fs/bcachefs/disk_groups.c
+++ /dev/null
@@ -1,616 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "disk_groups.h"
-#include "sb-members.h"
-#include "super-io.h"
-
-#include <linux/sort.h>
-
-static int group_cmp(const void *_l, const void *_r)
-{
-	const struct bch_disk_group *l = _l;
-	const struct bch_disk_group *r = _r;
-
-	return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) -
-		(BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?:
-		((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) -
-		 (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?:
-		strncmp(l->label, r->label, sizeof(l->label));
-}
-
-static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field *f,
-				enum bch_validate_flags flags, struct printbuf *err)
-{
-	struct bch_sb_field_disk_groups *groups =
-		field_to_type(f, disk_groups);
-	struct bch_disk_group *g, *sorted = NULL;
-	unsigned nr_groups = disk_groups_nr(groups);
-	unsigned i, len;
-	int ret = 0;
-
-	for (i = 0; i < sb->nr_devices; i++) {
-		struct bch_member m = bch2_sb_member_get(sb, i);
-		unsigned group_id;
-
-		if (!BCH_MEMBER_GROUP(&m))
-			continue;
-
-		group_id = BCH_MEMBER_GROUP(&m) - 1;
-
-		if (group_id >= nr_groups) {
-			prt_printf(err, "disk %u has invalid label %u (have %u)",
-				   i, group_id, nr_groups);
-			return -BCH_ERR_invalid_sb_disk_groups;
-		}
-
-		if (BCH_GROUP_DELETED(&groups->entries[group_id])) {
-			prt_printf(err, "disk %u has deleted label %u", i, group_id);
-			return -BCH_ERR_invalid_sb_disk_groups;
-		}
-	}
-
-	if (!nr_groups)
-		return 0;
-
-	for (i = 0; i < nr_groups; i++) {
-		g = groups->entries + i;
-
-		if (BCH_GROUP_DELETED(g))
-			continue;
-
-		len = strnlen(g->label, sizeof(g->label));
-		if (!len) {
-			prt_printf(err, "label %u empty", i);
-			return -BCH_ERR_invalid_sb_disk_groups;
-		}
-	}
-
-	sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL);
-	if (!sorted)
-		return -BCH_ERR_ENOMEM_disk_groups_validate;
-
-	memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted));
-	sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL);
-
-	for (g = sorted; g + 1 < sorted + nr_groups; g++)
-		if (!BCH_GROUP_DELETED(g) &&
-		    !group_cmp(&g[0], &g[1])) {
-			prt_printf(err, "duplicate label %llu.%.*s",
-			       BCH_GROUP_PARENT(g),
-			       (int) sizeof(g->label), g->label);
-			ret = -BCH_ERR_invalid_sb_disk_groups;
-			goto err;
-		}
-err:
-	kfree(sorted);
-	return ret;
-}
-
-void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
-{
-	out->atomic++;
-	rcu_read_lock();
-
-	struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
-	if (!g)
-		goto out;
-
-	for (unsigned i = 0; i < g->nr; i++) {
-		if (i)
-			prt_printf(out, " ");
-
-		if (g->entries[i].deleted) {
-			prt_printf(out, "[deleted]");
-			continue;
-		}
-
-		prt_printf(out, "[parent %d devs", g->entries[i].parent);
-		for_each_member_device_rcu(c, ca, &g->entries[i].devs)
-			prt_printf(out, " %s", ca->name);
-		prt_printf(out, "]");
-	}
-
-out:
-	rcu_read_unlock();
-	out->atomic--;
-}
-
-static void bch2_sb_disk_groups_to_text(struct printbuf *out,
-					struct bch_sb *sb,
-					struct bch_sb_field *f)
-{
-	struct bch_sb_field_disk_groups *groups =
-		field_to_type(f, disk_groups);
-	struct bch_disk_group *g;
-	unsigned nr_groups = disk_groups_nr(groups);
-
-	for (g = groups->entries;
-	     g < groups->entries + nr_groups;
-	     g++) {
-		if (g != groups->entries)
-			prt_printf(out, " ");
-
-		if (BCH_GROUP_DELETED(g))
-			prt_printf(out, "[deleted]");
-		else
-			prt_printf(out, "[parent %llu name %s]",
-			       BCH_GROUP_PARENT(g), g->label);
-	}
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = {
-	.validate	= bch2_sb_disk_groups_validate,
-	.to_text	= bch2_sb_disk_groups_to_text
-};
-
-int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
-{
-	struct bch_sb_field_disk_groups *groups;
-	struct bch_disk_groups_cpu *cpu_g, *old_g;
-	unsigned i, g, nr_groups;
-
-	lockdep_assert_held(&c->sb_lock);
-
-	groups		= bch2_sb_field_get(c->disk_sb.sb, disk_groups);
-	nr_groups	= disk_groups_nr(groups);
-
-	if (!groups)
-		return 0;
-
-	cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL);
-	if (!cpu_g)
-		return -BCH_ERR_ENOMEM_disk_groups_to_cpu;
-
-	cpu_g->nr = nr_groups;
-
-	for (i = 0; i < nr_groups; i++) {
-		struct bch_disk_group *src	= &groups->entries[i];
-		struct bch_disk_group_cpu *dst	= &cpu_g->entries[i];
-
-		dst->deleted	= BCH_GROUP_DELETED(src);
-		dst->parent	= BCH_GROUP_PARENT(src);
-		memcpy(dst->label, src->label, sizeof(dst->label));
-	}
-
-	for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
-		struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i);
-		struct bch_disk_group_cpu *dst;
-
-		if (!bch2_member_alive(&m))
-			continue;
-
-		g = BCH_MEMBER_GROUP(&m);
-		while (g) {
-			dst = &cpu_g->entries[g - 1];
-			__set_bit(i, dst->devs.d);
-			g = dst->parent;
-		}
-	}
-
-	old_g = rcu_dereference_protected(c->disk_groups,
-				lockdep_is_held(&c->sb_lock));
-	rcu_assign_pointer(c->disk_groups, cpu_g);
-	if (old_g)
-		kfree_rcu(old_g, rcu);
-
-	return 0;
-}
-
-const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
-{
-	struct target t = target_decode(target);
-	struct bch_devs_mask *devs;
-
-	rcu_read_lock();
-
-	switch (t.type) {
-	case TARGET_NULL:
-		devs = NULL;
-		break;
-	case TARGET_DEV: {
-		struct bch_dev *ca = t.dev < c->sb.nr_devices
-			? rcu_dereference(c->devs[t.dev])
-			: NULL;
-		devs = ca ? &ca->self : NULL;
-		break;
-	}
-	case TARGET_GROUP: {
-		struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
-
-		devs = g && t.group < g->nr && !g->entries[t.group].deleted
-			? &g->entries[t.group].devs
-			: NULL;
-		break;
-	}
-	default:
-		BUG();
-	}
-
-	rcu_read_unlock();
-
-	return devs;
-}
-
-bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
-{
-	struct target t = target_decode(target);
-
-	switch (t.type) {
-	case TARGET_NULL:
-		return false;
-	case TARGET_DEV:
-		return dev == t.dev;
-	case TARGET_GROUP: {
-		struct bch_disk_groups_cpu *g;
-		const struct bch_devs_mask *m;
-		bool ret;
-
-		rcu_read_lock();
-		g = rcu_dereference(c->disk_groups);
-		m = g && t.group < g->nr && !g->entries[t.group].deleted
-			? &g->entries[t.group].devs
-			: NULL;
-
-		ret = m ? test_bit(dev, m->d) : false;
-		rcu_read_unlock();
-
-		return ret;
-	}
-	default:
-		BUG();
-	}
-}
-
-static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
-				  unsigned parent,
-				  const char *name, unsigned namelen)
-{
-	unsigned i, nr_groups = disk_groups_nr(groups);
-
-	if (!namelen || namelen > BCH_SB_LABEL_SIZE)
-		return -EINVAL;
-
-	for (i = 0; i < nr_groups; i++) {
-		struct bch_disk_group *g = groups->entries + i;
-
-		if (BCH_GROUP_DELETED(g))
-			continue;
-
-		if (!BCH_GROUP_DELETED(g) &&
-		    BCH_GROUP_PARENT(g) == parent &&
-		    strnlen(g->label, sizeof(g->label)) == namelen &&
-		    !memcmp(name, g->label, namelen))
-			return i;
-	}
-
-	return -1;
-}
-
-static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
-				 const char *name, unsigned namelen)
-{
-	struct bch_sb_field_disk_groups *groups =
-		bch2_sb_field_get(sb->sb, disk_groups);
-	unsigned i, nr_groups = disk_groups_nr(groups);
-	struct bch_disk_group *g;
-
-	if (!namelen || namelen > BCH_SB_LABEL_SIZE)
-		return -EINVAL;
-
-	for (i = 0;
-	     i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]);
-	     i++)
-		;
-
-	if (i == nr_groups) {
-		unsigned u64s =
-			(sizeof(struct bch_sb_field_disk_groups) +
-			 sizeof(struct bch_disk_group) * (nr_groups + 1)) /
-			sizeof(u64);
-
-		groups = bch2_sb_field_resize(sb, disk_groups, u64s);
-		if (!groups)
-			return -BCH_ERR_ENOSPC_disk_label_add;
-
-		nr_groups = disk_groups_nr(groups);
-	}
-
-	BUG_ON(i >= nr_groups);
-
-	g = &groups->entries[i];
-
-	memcpy(g->label, name, namelen);
-	if (namelen < sizeof(g->label))
-		g->label[namelen] = '\0';
-	SET_BCH_GROUP_DELETED(g, 0);
-	SET_BCH_GROUP_PARENT(g, parent);
-	SET_BCH_GROUP_DATA_ALLOWED(g, ~0);
-
-	return i;
-}
-
-int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name)
-{
-	struct bch_sb_field_disk_groups *groups =
-		bch2_sb_field_get(sb->sb, disk_groups);
-	int v = -1;
-
-	do {
-		const char *next = strchrnul(name, '.');
-		unsigned len = next - name;
-
-		if (*next == '.')
-			next++;
-
-		v = __bch2_disk_group_find(groups, v + 1, name, len);
-		name = next;
-	} while (*name && v >= 0);
-
-	return v;
-}
-
-int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
-{
-	struct bch_sb_field_disk_groups *groups;
-	unsigned parent = 0;
-	int v = -1;
-
-	do {
-		const char *next = strchrnul(name, '.');
-		unsigned len = next - name;
-
-		if (*next == '.')
-			next++;
-
-		groups = bch2_sb_field_get(sb->sb, disk_groups);
-
-		v = __bch2_disk_group_find(groups, parent, name, len);
-		if (v < 0)
-			v = __bch2_disk_group_add(sb, parent, name, len);
-		if (v < 0)
-			return v;
-
-		parent = v + 1;
-		name = next;
-	} while (*name && v >= 0);
-
-	return v;
-}
-
-void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
-{
-	struct bch_disk_groups_cpu *groups;
-	struct bch_disk_group_cpu *g;
-	unsigned nr = 0;
-	u16 path[32];
-
-	out->atomic++;
-	rcu_read_lock();
-	groups = rcu_dereference(c->disk_groups);
-	if (!groups)
-		goto invalid;
-
-	while (1) {
-		if (nr == ARRAY_SIZE(path))
-			goto invalid;
-
-		if (v >= groups->nr)
-			goto invalid;
-
-		g = groups->entries + v;
-
-		if (g->deleted)
-			goto invalid;
-
-		path[nr++] = v;
-
-		if (!g->parent)
-			break;
-
-		v = g->parent - 1;
-	}
-
-	while (nr) {
-		v = path[--nr];
-		g = groups->entries + v;
-
-		prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
-		if (nr)
-			prt_printf(out, ".");
-	}
-out:
-	rcu_read_unlock();
-	out->atomic--;
-	return;
-invalid:
-	prt_printf(out, "invalid label %u", v);
-	goto out;
-}
-
-void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
-{
-	struct bch_sb_field_disk_groups *groups =
-		bch2_sb_field_get(sb, disk_groups);
-	struct bch_disk_group *g;
-	unsigned nr = 0;
-	u16 path[32];
-
-	while (1) {
-		if (nr == ARRAY_SIZE(path))
-			goto inval;
-
-		if (v >= disk_groups_nr(groups))
-			goto inval;
-
-		g = groups->entries + v;
-
-		if (BCH_GROUP_DELETED(g))
-			goto inval;
-
-		path[nr++] = v;
-
-		if (!BCH_GROUP_PARENT(g))
-			break;
-
-		v = BCH_GROUP_PARENT(g) - 1;
-	}
-
-	while (nr) {
-		v = path[--nr];
-		g = groups->entries + v;
-
-		prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
-		if (nr)
-			prt_printf(out, ".");
-	}
-	return;
-inval:
-	prt_printf(out, "invalid label %u", v);
-}
-
-int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
-{
-	struct bch_member *mi;
-	int ret, v = -1;
-
-	if (!strlen(name) || !strcmp(name, "none"))
-		return 0;
-
-	v = bch2_disk_path_find_or_create(&c->disk_sb, name);
-	if (v < 0)
-		return v;
-
-	ret = bch2_sb_disk_groups_to_cpu(c);
-	if (ret)
-		return ret;
-
-	mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
-	SET_BCH_MEMBER_GROUP(mi, v + 1);
-	return 0;
-}
-
-int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
-{
-	int ret;
-
-	mutex_lock(&c->sb_lock);
-	ret = __bch2_dev_group_set(c, ca, name) ?:
-		bch2_write_super(c);
-	mutex_unlock(&c->sb_lock);
-
-	return ret;
-}
-
-int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res,
-			  struct printbuf *err)
-{
-	struct bch_dev *ca;
-	int g;
-
-	if (!val)
-		return -EINVAL;
-
-	if (!c)
-		return 0;
-
-	if (!strlen(val) || !strcmp(val, "none")) {
-		*res = 0;
-		return 0;
-	}
-
-	/* Is it a device? */
-	ca = bch2_dev_lookup(c, val);
-	if (!IS_ERR(ca)) {
-		*res = dev_to_target(ca->dev_idx);
-		bch2_dev_put(ca);
-		return 0;
-	}
-
-	mutex_lock(&c->sb_lock);
-	g = bch2_disk_path_find(&c->disk_sb, val);
-	mutex_unlock(&c->sb_lock);
-
-	if (g >= 0) {
-		*res = group_to_target(g);
-		return 0;
-	}
-
-	return -EINVAL;
-}
-
-void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
-{
-	struct target t = target_decode(v);
-
-	switch (t.type) {
-	case TARGET_NULL:
-		prt_printf(out, "none");
-		break;
-	case TARGET_DEV: {
-		struct bch_dev *ca;
-
-		out->atomic++;
-		rcu_read_lock();
-		ca = t.dev < c->sb.nr_devices
-			? rcu_dereference(c->devs[t.dev])
-			: NULL;
-
-		if (ca && percpu_ref_tryget(&ca->io_ref)) {
-			prt_printf(out, "/dev/%s", ca->name);
-			percpu_ref_put(&ca->io_ref);
-		} else if (ca) {
-			prt_printf(out, "offline device %u", t.dev);
-		} else {
-			prt_printf(out, "invalid device %u", t.dev);
-		}
-
-		rcu_read_unlock();
-		out->atomic--;
-		break;
-	}
-	case TARGET_GROUP:
-		bch2_disk_path_to_text(out, c, t.group);
-		break;
-	default:
-		BUG();
-	}
-}
-
-static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
-{
-	struct target t = target_decode(v);
-
-	switch (t.type) {
-	case TARGET_NULL:
-		prt_printf(out, "none");
-		break;
-	case TARGET_DEV: {
-		struct bch_member m = bch2_sb_member_get(sb, t.dev);
-
-		if (bch2_member_exists(sb, t.dev)) {
-			prt_printf(out, "Device ");
-			pr_uuid(out, m.uuid.b);
-			prt_printf(out, " (%u)", t.dev);
-		} else {
-			prt_printf(out, "Bad device %u", t.dev);
-		}
-		break;
-	}
-	case TARGET_GROUP:
-		bch2_disk_path_to_text_sb(out, sb, t.group);
-		break;
-	default:
-		BUG();
-	}
-}
-
-void bch2_opt_target_to_text(struct printbuf *out,
-			     struct bch_fs *c,
-			     struct bch_sb *sb,
-			     u64 v)
-{
-	if (c)
-		bch2_target_to_text(out, c, v);
-	else
-		bch2_target_to_text_sb(out, sb, v);
-}
diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h
deleted file mode 100644
index 441826fff224..000000000000
--- a/fs/bcachefs/disk_groups.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_GROUPS_H
-#define _BCACHEFS_DISK_GROUPS_H
-
-#include "disk_groups_types.h"
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;
-
-static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
-{
-	return groups
-		? (vstruct_end(&groups->field) -
-		   (void *) &groups->entries[0]) / sizeof(struct bch_disk_group)
-		: 0;
-}
-
-struct target {
-	enum {
-		TARGET_NULL,
-		TARGET_DEV,
-		TARGET_GROUP,
-	}			type;
-	union {
-		unsigned	dev;
-		unsigned	group;
-	};
-};
-
-#define TARGET_DEV_START	1
-#define TARGET_GROUP_START	(256 + TARGET_DEV_START)
-
-static inline u16 dev_to_target(unsigned dev)
-{
-	return TARGET_DEV_START + dev;
-}
-
-static inline u16 group_to_target(unsigned group)
-{
-	return TARGET_GROUP_START + group;
-}
-
-static inline struct target target_decode(unsigned target)
-{
-	if (target >= TARGET_GROUP_START)
-		return (struct target) {
-			.type	= TARGET_GROUP,
-			.group	= target - TARGET_GROUP_START
-		};
-
-	if (target >= TARGET_DEV_START)
-		return (struct target) {
-			.type	= TARGET_DEV,
-			.group	= target - TARGET_DEV_START
-		};
-
-	return (struct target) { .type = TARGET_NULL };
-}
-
-const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
-
-static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c,
-						  enum bch_data_type data_type,
-						  u16 target)
-{
-	struct bch_devs_mask devs = c->rw_devs[data_type];
-	const struct bch_devs_mask *t = bch2_target_to_mask(c, target);
-
-	if (t)
-		bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
-	return devs;
-}
-
-static inline bool bch2_target_accepts_data(struct bch_fs *c,
-					    enum bch_data_type data_type,
-					    u16 target)
-{
-	struct bch_devs_mask rw_devs = target_rw_devs(c, data_type, target);
-	return !bitmap_empty(rw_devs.d, BCH_SB_MEMBERS_MAX);
-}
-
-bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);
-
-int bch2_disk_path_find(struct bch_sb_handle *, const char *);
-
-/* Exported for userspace bcachefs-tools: */
-int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
-
-void bch2_disk_path_to_text(struct printbuf *, struct bch_fs *, unsigned);
-void bch2_disk_path_to_text_sb(struct printbuf *, struct bch_sb *, unsigned);
-
-void bch2_target_to_text(struct printbuf *out, struct bch_fs *, unsigned);
-
-int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
-void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
-
-#define bch2_opt_target (struct bch_opt_fn) {		\
-	.parse		= bch2_opt_target_parse,	\
-	.to_text	= bch2_opt_target_to_text,	\
-}
-
-int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
-
-int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
-int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
-
-const char *bch2_sb_validate_disk_groups(struct bch_sb *,
-					 struct bch_sb_field *);
-
-void bch2_disk_groups_to_text(struct printbuf *, struct bch_fs *);
-
-#endif /* _BCACHEFS_DISK_GROUPS_H */
diff --git a/fs/bcachefs/disk_groups_types.h b/fs/bcachefs/disk_groups_types.h
deleted file mode 100644
index a54ef085b13d..000000000000
--- a/fs/bcachefs/disk_groups_types.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_GROUPS_TYPES_H
-#define _BCACHEFS_DISK_GROUPS_TYPES_H
-
-struct bch_disk_group_cpu {
-	bool				deleted;
-	u16				parent;
-	u8				label[BCH_SB_LABEL_SIZE];
-	struct bch_devs_mask		devs;
-};
-
-struct bch_disk_groups_cpu {
-	struct rcu_head			rcu;
-	unsigned			nr;
-	struct bch_disk_group_cpu	entries[] __counted_by(nr);
-};
-
-#endif /* _BCACHEFS_DISK_GROUPS_TYPES_H */
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
deleted file mode 100644
index b26dc7424662..000000000000
--- a/fs/bcachefs/ec.c
+++ /dev/null
@@ -1,2290 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/* erasure coding */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
-#include "bset.h"
-#include "btree_gc.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "error.h"
-#include "io_read.h"
-#include "keylist.h"
-#include "recovery.h"
-#include "replicas.h"
-#include "super-io.h"
-#include "util.h"
-
-#include <linux/sort.h>
-
-#ifdef __KERNEL__
-
-#include <linux/raid/pq.h>
-#include <linux/raid/xor.h>
-
-static void raid5_recov(unsigned disks, unsigned failed_idx,
-			size_t size, void **data)
-{
-	unsigned i = 2, nr;
-
-	BUG_ON(failed_idx >= disks);
-
-	swap(data[0], data[failed_idx]);
-	memcpy(data[0], data[1], size);
-
-	while (i < disks) {
-		nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
-		xor_blocks(nr, size, data[0], data + i);
-		i += nr;
-	}
-
-	swap(data[0], data[failed_idx]);
-}
-
-static void raid_gen(int nd, int np, size_t size, void **v)
-{
-	if (np >= 1)
-		raid5_recov(nd + np, nd, size, v);
-	if (np >= 2)
-		raid6_call.gen_syndrome(nd + np, size, v);
-	BUG_ON(np > 2);
-}
-
-static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
-{
-	switch (nr) {
-	case 0:
-		break;
-	case 1:
-		if (ir[0] < nd + 1)
-			raid5_recov(nd + 1, ir[0], size, v);
-		else
-			raid6_call.gen_syndrome(nd + np, size, v);
-		break;
-	case 2:
-		if (ir[1] < nd) {
-			/* data+data failure. */
-			raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
-		} else if (ir[0] < nd) {
-			/* data + p/q failure */
-
-			if (ir[1] == nd) /* data + p failure */
-				raid6_datap_recov(nd + np, size, ir[0], v);
-			else { /* data + q failure */
-				raid5_recov(nd + 1, ir[0], size, v);
-				raid6_call.gen_syndrome(nd + np, size, v);
-			}
-		} else {
-			raid_gen(nd, np, size, v);
-		}
-		break;
-	default:
-		BUG();
-	}
-}
-
-#else
-
-#include <raid/raid.h>
-
-#endif
-
-struct ec_bio {
-	struct bch_dev		*ca;
-	struct ec_stripe_buf	*buf;
-	size_t			idx;
-	struct bio		bio;
-};
-
-/* Stripes btree keys: */
-
-int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k,
-			enum bch_validate_flags flags,
-			struct printbuf *err)
-{
-	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
-	int ret = 0;
-
-	bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) ||
-			 bpos_gt(k.k->p, POS(0, U32_MAX)), c, err,
-			 stripe_pos_bad,
-			 "stripe at bad pos");
-
-	bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err,
-			 stripe_val_size_bad,
-			 "incorrect value size (%zu < %u)",
-			 bkey_val_u64s(k.k), stripe_val_u64s(s));
-
-	ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
-fsck_err:
-	return ret;
-}
-
-void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
-			 struct bkey_s_c k)
-{
-	const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v;
-	struct bch_stripe s = {};
-
-	memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k)));
-
-	unsigned nr_data = s.nr_blocks - s.nr_redundant;
-
-	prt_printf(out, "algo %u sectors %u blocks %u:%u csum ",
-		   s.algorithm,
-		   le16_to_cpu(s.sectors),
-		   nr_data,
-		   s.nr_redundant);
-	bch2_prt_csum_type(out, s.csum_type);
-	prt_printf(out, " gran %u", 1U << s.csum_granularity_bits);
-
-	for (unsigned i = 0; i < s.nr_blocks; i++) {
-		const struct bch_extent_ptr *ptr = sp->ptrs + i;
-
-		if ((void *) ptr >= bkey_val_end(k))
-			break;
-
-		bch2_extent_ptr_to_text(out, c, ptr);
-
-		if (s.csum_type < BCH_CSUM_NR &&
-		    i < nr_data &&
-		    stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k))
-			prt_printf(out,  "#%u", stripe_blockcount_get(sp, i));
-	}
-}
-
-/* Triggers: */
-
-static int __mark_stripe_bucket(struct btree_trans *trans,
-				struct bch_dev *ca,
-				struct bkey_s_c_stripe s,
-				unsigned ptr_idx, bool deleting,
-				struct bpos bucket,
-				struct bch_alloc_v4 *a,
-				enum btree_iter_update_trigger_flags flags)
-{
-	const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
-	unsigned nr_data = s.v->nr_blocks - s.v->nr_redundant;
-	bool parity = ptr_idx >= nr_data;
-	enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
-	s64 sectors = parity ? le16_to_cpu(s.v->sectors) : 0;
-	struct printbuf buf = PRINTBUF;
-	int ret = 0;
-
-	struct bch_fs *c = trans->c;
-	if (deleting)
-		sectors = -sectors;
-
-	if (!deleting) {
-		if (bch2_trans_inconsistent_on(a->stripe ||
-					       a->stripe_redundancy, trans,
-				"bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)\n%s",
-				bucket.inode, bucket.offset, a->gen,
-				bch2_data_type_str(a->data_type),
-				a->dirty_sectors,
-				a->stripe, s.k->p.offset,
-				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
-			ret = -EIO;
-			goto err;
-		}
-
-		if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans,
-				"bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s",
-				bucket.inode, bucket.offset, a->gen,
-				bch2_data_type_str(a->data_type),
-				a->dirty_sectors,
-				a->cached_sectors,
-				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
-			ret = -EIO;
-			goto err;
-		}
-	} else {
-		if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset ||
-					       a->stripe_redundancy != s.v->nr_redundant, trans,
-				"bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s",
-				bucket.inode, bucket.offset, a->gen,
-				a->stripe,
-				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
-			ret = -EIO;
-			goto err;
-		}
-
-		if (bch2_trans_inconsistent_on(a->data_type != data_type, trans,
-				"bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s",
-				bucket.inode, bucket.offset, a->gen,
-				bch2_data_type_str(a->data_type),
-				bch2_data_type_str(data_type),
-				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
-			ret = -EIO;
-			goto err;
-		}
-
-		if (bch2_trans_inconsistent_on(parity &&
-					       (a->dirty_sectors != -sectors ||
-						a->cached_sectors), trans,
-				"bucket %llu:%llu gen %u dirty_sectors %u cached_sectors %u: wrong sectors when deleting parity block of stripe\n%s",
-				bucket.inode, bucket.offset, a->gen,
-				a->dirty_sectors,
-				a->cached_sectors,
-				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
-			ret = -EIO;
-			goto err;
-		}
-	}
-
-	if (sectors) {
-		ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type,
-					     a->gen, a->data_type, &a->dirty_sectors);
-		if (ret)
-			goto err;
-	}
-
-	if (!deleting) {
-		a->stripe		= s.k->p.offset;
-		a->stripe_redundancy	= s.v->nr_redundant;
-	} else {
-		a->stripe		= 0;
-		a->stripe_redundancy	= 0;
-	}
-
-	alloc_data_type_set(a, data_type);
-err:
-	printbuf_exit(&buf);
-	return ret;
-}
-
-static int mark_stripe_bucket(struct btree_trans *trans,
-			      struct bkey_s_c_stripe s,
-			      unsigned ptr_idx, bool deleting,
-			      enum btree_iter_update_trigger_flags flags)
-{
-	struct bch_fs *c = trans->c;
-	const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
-	int ret = 0;
-
-	struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
-	if (unlikely(!ca)) {
-		if (!(flags & BTREE_TRIGGER_overwrite))
-			ret = -EIO;
-		goto err;
-	}
-
-	struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-
-	if (flags & BTREE_TRIGGER_transactional) {
-		struct bkey_i_alloc_v4 *a =
-			bch2_trans_start_alloc_update(trans, bucket);
-		ret = PTR_ERR_OR_ZERO(a) ?:
-			__mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags);
-	}
-
-	if (flags & BTREE_TRIGGER_gc) {
-		percpu_down_read(&c->mark_lock);
-		struct bucket *g = gc_bucket(ca, bucket.offset);
-		bucket_lock(g);
-		struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
-		ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags);
-		if (!ret) {
-			alloc_to_bucket(g, new);
-			bch2_dev_usage_update(c, ca, &old, &new, 0, true);
-		}
-		bucket_unlock(g);
-		percpu_up_read(&c->mark_lock);
-	}
-err:
-	bch2_dev_put(ca);
-	return ret;
-}
-
-static int mark_stripe_buckets(struct btree_trans *trans,
-			       struct bkey_s_c old, struct bkey_s_c new,
-			       enum btree_iter_update_trigger_flags flags)
-{
-	const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
-		? bkey_s_c_to_stripe(old).v : NULL;
-	const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
-		? bkey_s_c_to_stripe(new).v : NULL;
-
-	BUG_ON(old_s && new_s && old_s->nr_blocks != new_s->nr_blocks);
-
-	unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
-
-	for (unsigned i = 0; i < nr_blocks; i++) {
-		if (new_s && old_s &&
-		    !memcmp(&new_s->ptrs[i],
-			    &old_s->ptrs[i],
-			    sizeof(new_s->ptrs[i])))
-			continue;
-
-		if (new_s) {
-			int ret = mark_stripe_bucket(trans,
-					bkey_s_c_to_stripe(new), i, false, flags);
-			if (ret)
-				return ret;
-		}
-
-		if (old_s) {
-			int ret = mark_stripe_bucket(trans,
-					bkey_s_c_to_stripe(old), i, true, flags);
-			if (ret)
-				return ret;
-		}
-	}
-
-	return 0;
-}
-
-int bch2_trigger_stripe(struct btree_trans *trans,
-			enum btree_id btree, unsigned level,
-			struct bkey_s_c old, struct bkey_s _new,
-			enum btree_iter_update_trigger_flags flags)
-{
-	struct bkey_s_c new = _new.s_c;
-	struct bch_fs *c = trans->c;
-	u64 idx = new.k->p.offset;
-	const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
-		? bkey_s_c_to_stripe(old).v : NULL;
-	const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
-		? bkey_s_c_to_stripe(new).v : NULL;
-
-	if (unlikely(flags & BTREE_TRIGGER_check_repair))
-		return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags);
-
-	if (flags & BTREE_TRIGGER_transactional) {
-		/*
-		 * If the pointers aren't changing, we don't need to do anything:
-		 */
-		if (new_s && old_s &&
-		    new_s->nr_blocks	== old_s->nr_blocks &&
-		    new_s->nr_redundant	== old_s->nr_redundant &&
-		    !memcmp(old_s->ptrs, new_s->ptrs,
-			    new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
-			return 0;
-
-		BUG_ON(new_s && old_s &&
-		       (new_s->nr_blocks	!= old_s->nr_blocks ||
-			new_s->nr_redundant	!= old_s->nr_redundant));
-
-		if (new_s) {
-			s64 sectors = le16_to_cpu(new_s->sectors);
-
-			struct bch_replicas_padded r;
-			bch2_bkey_to_replicas(&r.e, new);
-			int ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
-			if (ret)
-				return ret;
-		}
-
-		if (old_s) {
-			s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
-
-			struct bch_replicas_padded r;
-			bch2_bkey_to_replicas(&r.e, old);
-			int ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
-			if (ret)
-				return ret;
-		}
-
-		int ret = mark_stripe_buckets(trans, old, new, flags);
-		if (ret)
-			return ret;
-	}
-
-	if (flags & BTREE_TRIGGER_atomic) {
-		struct stripe *m = genradix_ptr(&c->stripes, idx);
-
-		if (!m) {
-			struct printbuf buf1 = PRINTBUF;
-			struct printbuf buf2 = PRINTBUF;
-
-			bch2_bkey_val_to_text(&buf1, c, old);
-			bch2_bkey_val_to_text(&buf2, c, new);
-			bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
-					    "old %s\n"
-					    "new %s", idx, buf1.buf, buf2.buf);
-			printbuf_exit(&buf2);
-			printbuf_exit(&buf1);
-			bch2_inconsistent_error(c);
-			return -1;
-		}
-
-		if (!new_s) {
-			bch2_stripes_heap_del(c, m, idx);
-
-			memset(m, 0, sizeof(*m));
-		} else {
-			m->sectors	= le16_to_cpu(new_s->sectors);
-			m->algorithm	= new_s->algorithm;
-			m->nr_blocks	= new_s->nr_blocks;
-			m->nr_redundant	= new_s->nr_redundant;
-			m->blocks_nonempty = 0;
-
-			for (unsigned i = 0; i < new_s->nr_blocks; i++)
-				m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);
-
-			if (!old_s)
-				bch2_stripes_heap_insert(c, m, idx);
-			else
-				bch2_stripes_heap_update(c, m, idx);
-		}
-	}
-
-	if (flags & BTREE_TRIGGER_gc) {
-		struct gc_stripe *m =
-			genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
-
-		if (!m) {
-			bch_err(c, "error allocating memory for gc_stripes, idx %llu",
-				idx);
-			return -BCH_ERR_ENOMEM_mark_stripe;
-		}
-		/*
-		 * This will be wrong when we bring back runtime gc: we should
-		 * be unmarking the old key and then marking the new key
-		 */
-		m->alive	= true;
-		m->sectors	= le16_to_cpu(new_s->sectors);
-		m->nr_blocks	= new_s->nr_blocks;
-		m->nr_redundant	= new_s->nr_redundant;
-
-		for (unsigned i = 0; i < new_s->nr_blocks; i++)
-			m->ptrs[i] = new_s->ptrs[i];
-
-		bch2_bkey_to_replicas(&m->r.e, new);
-
-		/*
-		 * gc recalculates this field from stripe ptr
-		 * references:
-		 */
-		memset(m->block_sectors, 0, sizeof(m->block_sectors));
-
-		int ret = mark_stripe_buckets(trans, old, new, flags);
-		if (ret)
-			return ret;
-
-		ret = bch2_update_replicas(c, new, &m->r.e,
-				      ((s64) m->sectors * m->nr_redundant),
-				      0, true);
-		if (ret) {
-			struct printbuf buf = PRINTBUF;
-
-			bch2_bkey_val_to_text(&buf, c, new);
-			bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf);
-			printbuf_exit(&buf);
-			return ret;
-		}
-	}
-
-	return 0;
-}
-
-/* returns blocknr in stripe that we matched: */
-static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
-						struct bkey_s_c k, unsigned *block)
-{
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
-
-	bkey_for_each_ptr(ptrs, ptr)
-		for (i = 0; i < nr_data; i++)
-			if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr,
-						      le16_to_cpu(s->sectors))) {
-				*block = i;
-				return ptr;
-			}
-
-	return NULL;
-}
-
-static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
-{
-	switch (k.k->type) {
-	case KEY_TYPE_extent: {
-		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-		const union bch_extent_entry *entry;
-
-		extent_for_each_entry(e, entry)
-			if (extent_entry_type(entry) ==
-			    BCH_EXTENT_ENTRY_stripe_ptr &&
-			    entry->stripe_ptr.idx == idx)
-				return true;
-
-		break;
-	}
-	}
-
-	return false;
-}
-
-/* Stripe bufs: */
-
-static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
-{
-	if (buf->key.k.type == KEY_TYPE_stripe) {
-		struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key);
-		unsigned i;
-
-		for (i = 0; i < s->v.nr_blocks; i++) {
-			kvfree(buf->data[i]);
-			buf->data[i] = NULL;
-		}
-	}
-}
-
-/* XXX: this is a non-mempoolified memory allocation: */
-static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
-			      unsigned offset, unsigned size)
-{
-	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
-	unsigned csum_granularity = 1U << v->csum_granularity_bits;
-	unsigned end = offset + size;
-	unsigned i;
-
-	BUG_ON(end > le16_to_cpu(v->sectors));
-
-	offset	= round_down(offset, csum_granularity);
-	end	= min_t(unsigned, le16_to_cpu(v->sectors),
-			round_up(end, csum_granularity));
-
-	buf->offset	= offset;
-	buf->size	= end - offset;
-
-	memset(buf->valid, 0xFF, sizeof(buf->valid));
-
-	for (i = 0; i < v->nr_blocks; i++) {
-		buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL);
-		if (!buf->data[i])
-			goto err;
-	}
-
-	return 0;
-err:
-	ec_stripe_buf_exit(buf);
-	return -BCH_ERR_ENOMEM_stripe_buf;
-}
-
-/* Checksumming: */
-
-static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf,
-					 unsigned block, unsigned offset)
-{
-	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
-	unsigned csum_granularity = 1 << v->csum_granularity_bits;
-	unsigned end = buf->offset + buf->size;
-	unsigned len = min(csum_granularity, end - offset);
-
-	BUG_ON(offset >= end);
-	BUG_ON(offset <  buf->offset);
-	BUG_ON(offset & (csum_granularity - 1));
-	BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
-	       (len & (csum_granularity - 1)));
-
-	return bch2_checksum(NULL, v->csum_type,
-			     null_nonce(),
-			     buf->data[block] + ((offset - buf->offset) << 9),
-			     len << 9);
-}
-
-static void ec_generate_checksums(struct ec_stripe_buf *buf)
-{
-	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
-	unsigned i, j, csums_per_device = stripe_csums_per_device(v);
-
-	if (!v->csum_type)
-		return;
-
-	BUG_ON(buf->offset);
-	BUG_ON(buf->size != le16_to_cpu(v->sectors));
-
-	for (i = 0; i < v->nr_blocks; i++)
-		for (j = 0; j < csums_per_device; j++)
-			stripe_csum_set(v, i, j,
-				ec_block_checksum(buf, i, j << v->csum_granularity_bits));
-}
-
-static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
-{
-	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
-	unsigned csum_granularity = 1 << v->csum_granularity_bits;
-	unsigned i;
-
-	if (!v->csum_type)
-		return;
-
-	for (i = 0; i < v->nr_blocks; i++) {
-		unsigned offset = buf->offset;
-		unsigned end = buf->offset + buf->size;
-
-		if (!test_bit(i, buf->valid))
-			continue;
-
-		while (offset < end) {
-			unsigned j = offset >> v->csum_granularity_bits;
-			unsigned len = min(csum_granularity, end - offset);
-			struct bch_csum want = stripe_csum_get(v, i, j);
-			struct bch_csum got = ec_block_checksum(buf, i, offset);
-
-			if (bch2_crc_cmp(want, got)) {
-				struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev);
-				if (ca) {
-					struct printbuf err = PRINTBUF;
-
-					prt_str(&err, "stripe ");
-					bch2_csum_err_msg(&err, v->csum_type, want, got);
-					prt_printf(&err, "  for %ps at %u of\n  ", (void *) _RET_IP_, i);
-					bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
-					bch_err_ratelimited(ca, "%s", err.buf);
-					printbuf_exit(&err);
-
-					bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
-				}
-
-				clear_bit(i, buf->valid);
-				break;
-			}
-
-			offset += len;
-		}
-	}
-}
-
-/* Erasure coding: */
-
-static void ec_generate_ec(struct ec_stripe_buf *buf)
-{
-	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
-	unsigned nr_data = v->nr_blocks - v->nr_redundant;
-	unsigned bytes = le16_to_cpu(v->sectors) << 9;
-
-	raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
-}
-
-static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
-{
-	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
-
-	return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks);
-}
-
-static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
-{
-	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
-	unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0;
-	unsigned nr_data = v->nr_blocks - v->nr_redundant;
-	unsigned bytes = buf->size << 9;
-
-	if (ec_nr_failed(buf) > v->nr_redundant) {
-		bch_err_ratelimited(c,
-			"error doing reconstruct read: unable to read enough blocks");
-		return -1;
-	}
-
-	for (i = 0; i < nr_data; i++)
-		if (!test_bit(i, buf->valid))
-			failed[nr_failed++] = i;
-
-	raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
-	return 0;
-}
-
-/* IO: */
-
-static void ec_block_endio(struct bio *bio)
-{
-	struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
-	struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v;
-	struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
-	struct bch_dev *ca = ec_bio->ca;
-	struct closure *cl = bio->bi_private;
-
-	if (bch2_dev_io_err_on(bio->bi_status, ca,
-			       bio_data_dir(bio)
-			       ? BCH_MEMBER_ERROR_write
-			       : BCH_MEMBER_ERROR_read,
-			       "erasure coding %s error: %s",
-			       bio_data_dir(bio) ? "write" : "read",
-			       bch2_blk_status_to_str(bio->bi_status)))
-		clear_bit(ec_bio->idx, ec_bio->buf->valid);
-
-	if (dev_ptr_stale(ca, ptr)) {
-		bch_err_ratelimited(ca->fs,
-				    "error %s stripe: stale pointer after io",
-				    bio_data_dir(bio) == READ ? "reading from" : "writing to");
-		clear_bit(ec_bio->idx, ec_bio->buf->valid);
-	}
-
-	bio_put(&ec_bio->bio);
-	percpu_ref_put(&ca->io_ref);
-	closure_put(cl);
-}
-
-static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
-			blk_opf_t opf, unsigned idx, struct closure *cl)
-{
-	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
-	unsigned offset = 0, bytes = buf->size << 9;
-	struct bch_extent_ptr *ptr = &v->ptrs[idx];
-	enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant
-		? BCH_DATA_user
-		: BCH_DATA_parity;
-	int rw = op_is_write(opf);
-
-	struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw);
-	if (!ca) {
-		clear_bit(idx, buf->valid);
-		return;
-	}
-
-	if (dev_ptr_stale(ca, ptr)) {
-		bch_err_ratelimited(c,
-				    "error %s stripe: stale pointer",
-				    rw == READ ? "reading from" : "writing to");
-		clear_bit(idx, buf->valid);
-		return;
-	}
-
-
-	this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);
-
-	while (offset < bytes) {
-		unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
-					   DIV_ROUND_UP(bytes, PAGE_SIZE));
-		unsigned b = min_t(size_t, bytes - offset,
-				   nr_iovecs << PAGE_SHIFT);
-		struct ec_bio *ec_bio;
-
-		ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
-						       nr_iovecs,
-						       opf,
-						       GFP_KERNEL,
-						       &c->ec_bioset),
-				      struct ec_bio, bio);
-
-		ec_bio->ca			= ca;
-		ec_bio->buf			= buf;
-		ec_bio->idx			= idx;
-
-		ec_bio->bio.bi_iter.bi_sector	= ptr->offset + buf->offset + (offset >> 9);
-		ec_bio->bio.bi_end_io		= ec_block_endio;
-		ec_bio->bio.bi_private		= cl;
-
-		bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);
-
-		closure_get(cl);
-		percpu_ref_get(&ca->io_ref);
-
-		submit_bio(&ec_bio->bio);
-
-		offset += b;
-	}
-
-	percpu_ref_put(&ca->io_ref);
-}
-
-static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
-				struct ec_stripe_buf *stripe)
-{
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	int ret;
-
-	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
-			       POS(0, idx), BTREE_ITER_slots);
-	ret = bkey_err(k);
-	if (ret)
-		goto err;
-	if (k.k->type != KEY_TYPE_stripe) {
-		ret = -ENOENT;
-		goto err;
-	}
-	bkey_reassemble(&stripe->key, k);
-err:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-/* recovery read path: */
-int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
-{
-	struct bch_fs *c = trans->c;
-	struct ec_stripe_buf *buf;
-	struct closure cl;
-	struct bch_stripe *v;
-	unsigned i, offset;
-	int ret = 0;
-
-	closure_init_stack(&cl);
-
-	BUG_ON(!rbio->pick.has_ec);
-
-	buf = kzalloc(sizeof(*buf), GFP_NOFS);
-	if (!buf)
-		return -BCH_ERR_ENOMEM_ec_read_extent;
-
-	ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
-	if (ret) {
-		bch_err_ratelimited(c,
-			"error doing reconstruct read: error %i looking up stripe", ret);
-		kfree(buf);
-		return -EIO;
-	}
-
-	v = &bkey_i_to_stripe(&buf->key)->v;
-
-	if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
-		bch_err_ratelimited(c,
-			"error doing reconstruct read: pointer doesn't match stripe");
-		ret = -EIO;
-		goto err;
-	}
-
-	offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
-	if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
-		bch_err_ratelimited(c,
-			"error doing reconstruct read: read is bigger than stripe");
-		ret = -EIO;
-		goto err;
-	}
-
-	ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
-	if (ret)
-		goto err;
-
-	for (i = 0; i < v->nr_blocks; i++)
-		ec_block_io(c, buf, REQ_OP_READ, i, &cl);
-
-	closure_sync(&cl);
-
-	if (ec_nr_failed(buf) > v->nr_redundant) {
-		bch_err_ratelimited(c,
-			"error doing reconstruct read: unable to read enough blocks");
-		ret = -EIO;
-		goto err;
-	}
-
-	ec_validate_checksums(c, buf);
-
-	ret = ec_do_recov(c, buf);
-	if (ret)
-		goto err;
-
-	memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
-		      buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
-err:
-	ec_stripe_buf_exit(buf);
-	kfree(buf);
-	return ret;
-}
-
-/* stripe bucket accounting: */
-
-static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
-{
-	ec_stripes_heap n, *h = &c->ec_stripes_heap;
-
-	if (idx >= h->size) {
-		if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
-			return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
-
-		mutex_lock(&c->ec_stripes_heap_lock);
-		if (n.size > h->size) {
-			memcpy(n.data, h->data, h->used * sizeof(h->data[0]));
-			n.used = h->used;
-			swap(*h, n);
-		}
-		mutex_unlock(&c->ec_stripes_heap_lock);
-
-		free_heap(&n);
-	}
-
-	if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
-		return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
-
-	if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
-	    !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
-		return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
-
-	return 0;
-}
-
-static int ec_stripe_mem_alloc(struct btree_trans *trans,
-			       struct btree_iter *iter)
-{
-	return allocate_dropping_locks_errcode(trans,
-			__ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp));
-}
-
-/*
- * Hash table of open stripes:
- * Stripes that are being created or modified are kept in a hash table, so that
- * stripe deletion can skip them.
- */
-
-static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx)
-{
-	unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
-	struct ec_stripe_new *s;
-
-	hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash)
-		if (s->idx == idx)
-			return true;
-	return false;
-}
-
-static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx)
-{
-	bool ret = false;
-
-	spin_lock(&c->ec_stripes_new_lock);
-	ret = __bch2_stripe_is_open(c, idx);
-	spin_unlock(&c->ec_stripes_new_lock);
-
-	return ret;
-}
-
-static bool bch2_try_open_stripe(struct bch_fs *c,
-				 struct ec_stripe_new *s,
-				 u64 idx)
-{
-	bool ret;
-
-	spin_lock(&c->ec_stripes_new_lock);
-	ret = !__bch2_stripe_is_open(c, idx);
-	if (ret) {
-		unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
-
-		s->idx = idx;
-		hlist_add_head(&s->hash, &c->ec_stripes_new[hash]);
-	}
-	spin_unlock(&c->ec_stripes_new_lock);
-
-	return ret;
-}
-
-static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
-{
-	BUG_ON(!s->idx);
-
-	spin_lock(&c->ec_stripes_new_lock);
-	hlist_del_init(&s->hash);
-	spin_unlock(&c->ec_stripes_new_lock);
-
-	s->idx = 0;
-}
-
-/* Heap of all existing stripes, ordered by blocks_nonempty */
-
-static u64 stripe_idx_to_delete(struct bch_fs *c)
-{
-	ec_stripes_heap *h = &c->ec_stripes_heap;
-
-	lockdep_assert_held(&c->ec_stripes_heap_lock);
-
-	if (h->used &&
-	    h->data[0].blocks_nonempty == 0 &&
-	    !bch2_stripe_is_open(c, h->data[0].idx))
-		return h->data[0].idx;
-
-	return 0;
-}
-
-static inline int ec_stripes_heap_cmp(ec_stripes_heap *h,
-				      struct ec_stripe_heap_entry l,
-				      struct ec_stripe_heap_entry r)
-{
-	return ((l.blocks_nonempty > r.blocks_nonempty) -
-		(l.blocks_nonempty < r.blocks_nonempty));
-}
-
-static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
-						   size_t i)
-{
-	struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);
-
-	genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
-}
-
-static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
-{
-	ec_stripes_heap *h = &c->ec_stripes_heap;
-	struct stripe *m = genradix_ptr(&c->stripes, idx);
-
-	BUG_ON(m->heap_idx >= h->used);
-	BUG_ON(h->data[m->heap_idx].idx != idx);
-}
-
-void bch2_stripes_heap_del(struct bch_fs *c,
-			   struct stripe *m, size_t idx)
-{
-	mutex_lock(&c->ec_stripes_heap_lock);
-	heap_verify_backpointer(c, idx);
-
-	heap_del(&c->ec_stripes_heap, m->heap_idx,
-		 ec_stripes_heap_cmp,
-		 ec_stripes_heap_set_backpointer);
-	mutex_unlock(&c->ec_stripes_heap_lock);
-}
-
-void bch2_stripes_heap_insert(struct bch_fs *c,
-			      struct stripe *m, size_t idx)
-{
-	mutex_lock(&c->ec_stripes_heap_lock);
-	BUG_ON(heap_full(&c->ec_stripes_heap));
-
-	heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
-			.idx = idx,
-			.blocks_nonempty = m->blocks_nonempty,
-		}),
-		 ec_stripes_heap_cmp,
-		 ec_stripes_heap_set_backpointer);
-
-	heap_verify_backpointer(c, idx);
-	mutex_unlock(&c->ec_stripes_heap_lock);
-}
-
-void bch2_stripes_heap_update(struct bch_fs *c,
-			      struct stripe *m, size_t idx)
-{
-	ec_stripes_heap *h = &c->ec_stripes_heap;
-	bool do_deletes;
-	size_t i;
-
-	mutex_lock(&c->ec_stripes_heap_lock);
-	heap_verify_backpointer(c, idx);
-
-	h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
-
-	i = m->heap_idx;
-	heap_sift_up(h,	  i, ec_stripes_heap_cmp,
-		     ec_stripes_heap_set_backpointer);
-	heap_sift_down(h, i, ec_stripes_heap_cmp,
-		       ec_stripes_heap_set_backpointer);
-
-	heap_verify_backpointer(c, idx);
-
-	do_deletes = stripe_idx_to_delete(c) != 0;
-	mutex_unlock(&c->ec_stripes_heap_lock);
-
-	if (do_deletes)
-		bch2_do_stripe_deletes(c);
-}
-
-/* stripe deletion */
-
-static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	struct bkey_s_c_stripe s;
-	int ret;
-
-	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx),
-			       BTREE_ITER_intent);
-	ret = bkey_err(k);
-	if (ret)
-		goto err;
-
-	if (k.k->type != KEY_TYPE_stripe) {
-		bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx);
-		ret = -EINVAL;
-		goto err;
-	}
-
-	s = bkey_s_c_to_stripe(k);
-	for (unsigned i = 0; i < s.v->nr_blocks; i++)
-		if (stripe_blockcount_get(s.v, i)) {
-			struct printbuf buf = PRINTBUF;
-
-			bch2_bkey_val_to_text(&buf, c, k);
-			bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf);
-			printbuf_exit(&buf);
-			ret = -EINVAL;
-			goto err;
-		}
-
-	ret = bch2_btree_delete_at(trans, &iter, 0);
-err:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-static void ec_stripe_delete_work(struct work_struct *work)
-{
-	struct bch_fs *c =
-		container_of(work, struct bch_fs, ec_stripe_delete_work);
-
-	while (1) {
-		mutex_lock(&c->ec_stripes_heap_lock);
-		u64 idx = stripe_idx_to_delete(c);
-		mutex_unlock(&c->ec_stripes_heap_lock);
-
-		if (!idx)
-			break;
-
-		int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-					ec_stripe_delete(trans, idx));
-		bch_err_fn(c, ret);
-		if (ret)
-			break;
-	}
-
-	bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
-}
-
-void bch2_do_stripe_deletes(struct bch_fs *c)
-{
-	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) &&
-	    !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
-		bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
-}
-
-/* stripe creation: */
-
-static int ec_stripe_key_update(struct btree_trans *trans,
-				struct bkey_i_stripe *new,
-				bool create)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	int ret;
-
-	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
-			       new->k.p, BTREE_ITER_intent);
-	ret = bkey_err(k);
-	if (ret)
-		goto err;
-
-	if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) {
-		bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s",
-				     create ? "creating" : "updating",
-				     bch2_bkey_types[k.k->type]);
-		ret = -EINVAL;
-		goto err;
-	}
-
-	if (k.k->type == KEY_TYPE_stripe) {
-		const struct bch_stripe *old = bkey_s_c_to_stripe(k).v;
-		unsigned i;
-
-		if (old->nr_blocks != new->v.nr_blocks) {
-			bch_err(c, "error updating stripe: nr_blocks does not match");
-			ret = -EINVAL;
-			goto err;
-		}
-
-		for (i = 0; i < new->v.nr_blocks; i++) {
-			unsigned v = stripe_blockcount_get(old, i);
-
-			BUG_ON(v &&
-			       (old->ptrs[i].dev != new->v.ptrs[i].dev ||
-				old->ptrs[i].gen != new->v.ptrs[i].gen ||
-				old->ptrs[i].offset != new->v.ptrs[i].offset));
-
-			stripe_blockcount_set(&new->v, i, v);
-		}
-	}
-
-	ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
-err:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-static int ec_stripe_update_extent(struct btree_trans *trans,
-				   struct bch_dev *ca,
-				   struct bpos bucket, u8 gen,
-				   struct ec_stripe_buf *s,
-				   struct bpos *bp_pos)
-{
-	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
-	struct bch_fs *c = trans->c;
-	struct bch_backpointer bp;
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	const struct bch_extent_ptr *ptr_c;
-	struct bch_extent_ptr *ec_ptr = NULL;
-	struct bch_extent_stripe_ptr stripe_ptr;
-	struct bkey_i *n;
-	int ret, dev, block;
-
-	ret = bch2_get_next_backpointer(trans, ca, bucket, gen,
-				bp_pos, &bp, BTREE_ITER_cached);
-	if (ret)
-		return ret;
-	if (bpos_eq(*bp_pos, SPOS_MAX))
-		return 0;
-
-	if (bp.level) {
-		struct printbuf buf = PRINTBUF;
-		struct btree_iter node_iter;
-		struct btree *b;
-
-		b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp);
-		bch2_trans_iter_exit(trans, &node_iter);
-
-		if (!b)
-			return 0;
-
-		prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b);
-		bch2_backpointer_to_text(&buf, &bp);
-
-		bch2_fs_inconsistent(c, "%s", buf.buf);
-		printbuf_exit(&buf);
-		return -EIO;
-	}
-
-	k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_intent);
-	ret = bkey_err(k);
-	if (ret)
-		return ret;
-	if (!k.k) {
-		/*
-		 * extent no longer exists - we could flush the btree
-		 * write buffer and retry to verify, but no need:
-		 */
-		return 0;
-	}
-
-	if (extent_has_stripe_ptr(k, s->key.k.p.offset))
-		goto out;
-
-	ptr_c = bkey_matches_stripe(v, k, &block);
-	/*
-	 * It doesn't generally make sense to erasure code cached ptrs:
-	 * XXX: should we be incrementing a counter?
-	 */
-	if (!ptr_c || ptr_c->cached)
-		goto out;
-
-	dev = v->ptrs[block].dev;
-
-	n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr));
-	ret = PTR_ERR_OR_ZERO(n);
-	if (ret)
-		goto out;
-
-	bkey_reassemble(n, k);
-
-	bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev);
-	ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
-	BUG_ON(!ec_ptr);
-
-	stripe_ptr = (struct bch_extent_stripe_ptr) {
-		.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
-		.block		= block,
-		.redundancy	= v->nr_redundant,
-		.idx		= s->key.k.p.offset,
-	};
-
-	__extent_entry_insert(n,
-			(union bch_extent_entry *) ec_ptr,
-			(union bch_extent_entry *) &stripe_ptr);
-
-	ret = bch2_trans_update(trans, &iter, n, 0);
-out:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s,
-				   unsigned block)
-{
-	struct bch_fs *c = trans->c;
-	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
-	struct bch_extent_ptr ptr = v->ptrs[block];
-	struct bpos bp_pos = POS_MIN;
-	int ret = 0;
-
-	struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
-	if (!ca)
-		return -EIO;
-
-	struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
-
-	while (1) {
-		ret = commit_do(trans, NULL, NULL,
-				BCH_TRANS_COMMIT_no_check_rw|
-				BCH_TRANS_COMMIT_no_enospc,
-			ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, &bp_pos));
-		if (ret)
-			break;
-		if (bkey_eq(bp_pos, POS_MAX))
-			break;
-
-		bp_pos = bpos_nosnap_successor(bp_pos);
-	}
-
-	bch2_dev_put(ca);
-	return ret;
-}
-
-static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
-{
-	struct btree_trans *trans = bch2_trans_get(c);
-	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
-	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
-	int ret = 0;
-
-	ret = bch2_btree_write_buffer_flush_sync(trans);
-	if (ret)
-		goto err;
-
-	for (i = 0; i < nr_data; i++) {
-		ret = ec_stripe_update_bucket(trans, s, i);
-		if (ret)
-			break;
-	}
-err:
-	bch2_trans_put(trans);
-
-	return ret;
-}
-
-static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
-				       struct ec_stripe_new *s,
-				       unsigned block,
-				       struct open_bucket *ob)
-{
-	struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE);
-	if (!ca) {
-		s->err = -BCH_ERR_erofs_no_writes;
-		return;
-	}
-
-	unsigned offset = ca->mi.bucket_size - ob->sectors_free;
-	memset(s->new_stripe.data[block] + (offset << 9),
-	       0,
-	       ob->sectors_free << 9);
-
-	int ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
-			ob->bucket * ca->mi.bucket_size + offset,
-			ob->sectors_free,
-			GFP_KERNEL, 0);
-
-	percpu_ref_put(&ca->io_ref);
-
-	if (ret)
-		s->err = ret;
-}
-
-void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s)
-{
-	if (s->idx)
-		bch2_stripe_close(c, s);
-	kfree(s);
-}
-
-/*
- * data buckets of new stripe all written: create the stripe
- */
-static void ec_stripe_create(struct ec_stripe_new *s)
-{
-	struct bch_fs *c = s->c;
-	struct open_bucket *ob;
-	struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
-	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
-	int ret;
-
-	BUG_ON(s->h->s == s);
-
-	closure_sync(&s->iodone);
-
-	if (!s->err) {
-		for (i = 0; i < nr_data; i++)
-			if (s->blocks[i]) {
-				ob = c->open_buckets + s->blocks[i];
-
-				if (ob->sectors_free)
-					zero_out_rest_of_ec_bucket(c, s, i, ob);
-			}
-	}
-
-	if (s->err) {
-		if (!bch2_err_matches(s->err, EROFS))
-			bch_err(c, "error creating stripe: error writing data buckets");
-		goto err;
-	}
-
-	if (s->have_existing_stripe) {
-		ec_validate_checksums(c, &s->existing_stripe);
-
-		if (ec_do_recov(c, &s->existing_stripe)) {
-			bch_err(c, "error creating stripe: error reading existing stripe");
-			goto err;
-		}
-
-		for (i = 0; i < nr_data; i++)
-			if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i))
-				swap(s->new_stripe.data[i],
-				     s->existing_stripe.data[i]);
-
-		ec_stripe_buf_exit(&s->existing_stripe);
-	}
-
-	BUG_ON(!s->allocated);
-	BUG_ON(!s->idx);
-
-	ec_generate_ec(&s->new_stripe);
-
-	ec_generate_checksums(&s->new_stripe);
-
-	/* write p/q: */
-	for (i = nr_data; i < v->nr_blocks; i++)
-		ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
-	closure_sync(&s->iodone);
-
-	if (ec_nr_failed(&s->new_stripe)) {
-		bch_err(c, "error creating stripe: error writing redundancy buckets");
-		goto err;
-	}
-
-	ret = bch2_trans_do(c, &s->res, NULL,
-			    BCH_TRANS_COMMIT_no_check_rw|
-			    BCH_TRANS_COMMIT_no_enospc,
-			    ec_stripe_key_update(trans,
-					bkey_i_to_stripe(&s->new_stripe.key),
-					!s->have_existing_stripe));
-	bch_err_msg(c, ret, "creating stripe key");
-	if (ret) {
-		goto err;
-	}
-
-	ret = ec_stripe_update_extents(c, &s->new_stripe);
-	bch_err_msg(c, ret, "error updating extents");
-	if (ret)
-		goto err;
-err:
-	bch2_disk_reservation_put(c, &s->res);
-
-	for (i = 0; i < v->nr_blocks; i++)
-		if (s->blocks[i]) {
-			ob = c->open_buckets + s->blocks[i];
-
-			if (i < nr_data) {
-				ob->ec = NULL;
-				__bch2_open_bucket_put(c, ob);
-			} else {
-				bch2_open_bucket_put(c, ob);
-			}
-		}
-
-	mutex_lock(&c->ec_stripe_new_lock);
-	list_del(&s->list);
-	mutex_unlock(&c->ec_stripe_new_lock);
-	wake_up(&c->ec_stripe_new_wait);
-
-	ec_stripe_buf_exit(&s->existing_stripe);
-	ec_stripe_buf_exit(&s->new_stripe);
-	closure_debug_destroy(&s->iodone);
-
-	ec_stripe_new_put(c, s, STRIPE_REF_stripe);
-}
-
-static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
-{
-	struct ec_stripe_new *s;
-
-	mutex_lock(&c->ec_stripe_new_lock);
-	list_for_each_entry(s, &c->ec_stripe_new_list, list)
-		if (!atomic_read(&s->ref[STRIPE_REF_io]))
-			goto out;
-	s = NULL;
-out:
-	mutex_unlock(&c->ec_stripe_new_lock);
-
-	return s;
-}
-
-static void ec_stripe_create_work(struct work_struct *work)
-{
-	struct bch_fs *c = container_of(work,
-		struct bch_fs, ec_stripe_create_work);
-	struct ec_stripe_new *s;
-
-	while ((s = get_pending_stripe(c)))
-		ec_stripe_create(s);
-
-	bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
-}
-
-void bch2_ec_do_stripe_creates(struct bch_fs *c)
-{
-	bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create);
-
-	if (!queue_work(system_long_wq, &c->ec_stripe_create_work))
-		bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
-}
-
-static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
-{
-	struct ec_stripe_new *s = h->s;
-
-	BUG_ON(!s->allocated && !s->err);
-
-	h->s		= NULL;
-	s->pending	= true;
-
-	mutex_lock(&c->ec_stripe_new_lock);
-	list_add(&s->list, &c->ec_stripe_new_list);
-	mutex_unlock(&c->ec_stripe_new_lock);
-
-	ec_stripe_new_put(c, s, STRIPE_REF_io);
-}
-
-void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
-{
-	struct ec_stripe_new *s = ob->ec;
-
-	s->err = -EIO;
-}
-
-void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
-{
-	struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
-	if (!ob)
-		return NULL;
-
-	BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);
-
-	struct bch_dev *ca	= ob_dev(c, ob);
-	unsigned offset		= ca->mi.bucket_size - ob->sectors_free;
-
-	return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
-}
-
-static int unsigned_cmp(const void *_l, const void *_r)
-{
-	unsigned l = *((const unsigned *) _l);
-	unsigned r = *((const unsigned *) _r);
-
-	return cmp_int(l, r);
-}
-
-/* pick most common bucket size: */
-static unsigned pick_blocksize(struct bch_fs *c,
-			       struct bch_devs_mask *devs)
-{
-	unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX];
-	struct {
-		unsigned nr, size;
-	} cur = { 0, 0 }, best = { 0, 0 };
-
-	for_each_member_device_rcu(c, ca, devs)
-		sizes[nr++] = ca->mi.bucket_size;
-
-	sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);
-
-	for (unsigned i = 0; i < nr; i++) {
-		if (sizes[i] != cur.size) {
-			if (cur.nr > best.nr)
-				best = cur;
-
-			cur.nr = 0;
-			cur.size = sizes[i];
-		}
-
-		cur.nr++;
-	}
-
-	if (cur.nr > best.nr)
-		best = cur;
-
-	return best.size;
-}
-
-static bool may_create_new_stripe(struct bch_fs *c)
-{
-	return false;
-}
-
-static void ec_stripe_key_init(struct bch_fs *c,
-			       struct bkey_i *k,
-			       unsigned nr_data,
-			       unsigned nr_parity,
-			       unsigned stripe_size)
-{
-	struct bkey_i_stripe *s = bkey_stripe_init(k);
-	unsigned u64s;
-
-	s->v.sectors			= cpu_to_le16(stripe_size);
-	s->v.algorithm			= 0;
-	s->v.nr_blocks			= nr_data + nr_parity;
-	s->v.nr_redundant		= nr_parity;
-	s->v.csum_granularity_bits	= ilog2(c->opts.encoded_extent_max >> 9);
-	s->v.csum_type			= BCH_CSUM_crc32c;
-	s->v.pad			= 0;
-
-	while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
-		BUG_ON(1 << s->v.csum_granularity_bits >=
-		       le16_to_cpu(s->v.sectors) ||
-		       s->v.csum_granularity_bits == U8_MAX);
-		s->v.csum_granularity_bits++;
-	}
-
-	set_bkey_val_u64s(&s->k, u64s);
-}
-
-static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
-{
-	struct ec_stripe_new *s;
-
-	lockdep_assert_held(&h->lock);
-
-	s = kzalloc(sizeof(*s), GFP_KERNEL);
-	if (!s)
-		return -BCH_ERR_ENOMEM_ec_new_stripe_alloc;
-
-	mutex_init(&s->lock);
-	closure_init(&s->iodone, NULL);
-	atomic_set(&s->ref[STRIPE_REF_stripe], 1);
-	atomic_set(&s->ref[STRIPE_REF_io], 1);
-	s->c		= c;
-	s->h		= h;
-	s->nr_data	= min_t(unsigned, h->nr_active_devs,
-				BCH_BKEY_PTRS_MAX) - h->redundancy;
-	s->nr_parity	= h->redundancy;
-
-	ec_stripe_key_init(c, &s->new_stripe.key,
-			   s->nr_data, s->nr_parity, h->blocksize);
-
-	h->s = s;
-	return 0;
-}
-
-static struct ec_stripe_head *
-ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
-			 unsigned algo, unsigned redundancy,
-			 enum bch_watermark watermark)
-{
-	struct ec_stripe_head *h;
-
-	h = kzalloc(sizeof(*h), GFP_KERNEL);
-	if (!h)
-		return NULL;
-
-	mutex_init(&h->lock);
-	BUG_ON(!mutex_trylock(&h->lock));
-
-	h->target	= target;
-	h->algo		= algo;
-	h->redundancy	= redundancy;
-	h->watermark	= watermark;
-
-	rcu_read_lock();
-	h->devs = target_rw_devs(c, BCH_DATA_user, target);
-
-	for_each_member_device_rcu(c, ca, &h->devs)
-		if (!ca->mi.durability)
-			__clear_bit(ca->dev_idx, h->devs.d);
-
-	h->blocksize = pick_blocksize(c, &h->devs);
-
-	for_each_member_device_rcu(c, ca, &h->devs)
-		if (ca->mi.bucket_size == h->blocksize)
-			h->nr_active_devs++;
-
-	rcu_read_unlock();
-
-	/*
-	 * If we only have redundancy + 1 devices, we're better off with just
-	 * replication:
-	 */
-	if (h->nr_active_devs < h->redundancy + 2)
-		bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
-			h->nr_active_devs, h->redundancy + 2);
-
-	list_add(&h->list, &c->ec_stripe_head_list);
-	return h;
-}
-
-void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
-{
-	if (h->s &&
-	    h->s->allocated &&
-	    bitmap_weight(h->s->blocks_allocated,
-			  h->s->nr_data) == h->s->nr_data)
-		ec_stripe_set_pending(c, h);
-
-	mutex_unlock(&h->lock);
-}
-
-static struct ec_stripe_head *
-__bch2_ec_stripe_head_get(struct btree_trans *trans,
-			  unsigned target,
-			  unsigned algo,
-			  unsigned redundancy,
-			  enum bch_watermark watermark)
-{
-	struct bch_fs *c = trans->c;
-	struct ec_stripe_head *h;
-	int ret;
-
-	if (!redundancy)
-		return NULL;
-
-	ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock);
-	if (ret)
-		return ERR_PTR(ret);
-
-	if (test_bit(BCH_FS_going_ro, &c->flags)) {
-		h = ERR_PTR(-BCH_ERR_erofs_no_writes);
-		goto found;
-	}
-
-	list_for_each_entry(h, &c->ec_stripe_head_list, list)
-		if (h->target		== target &&
-		    h->algo		== algo &&
-		    h->redundancy	== redundancy &&
-		    h->watermark	== watermark) {
-			ret = bch2_trans_mutex_lock(trans, &h->lock);
-			if (ret)
-				h = ERR_PTR(ret);
-			goto found;
-		}
-
-	h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
-found:
-	if (!IS_ERR_OR_NULL(h) &&
-	    h->nr_active_devs < h->redundancy + 2) {
-		mutex_unlock(&h->lock);
-		h = NULL;
-	}
-	mutex_unlock(&c->ec_stripe_head_lock);
-	return h;
-}
-
-static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h,
-				    enum bch_watermark watermark, struct closure *cl)
-{
-	struct bch_fs *c = trans->c;
-	struct bch_devs_mask devs = h->devs;
-	struct open_bucket *ob;
-	struct open_buckets buckets;
-	struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
-	unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
-	bool have_cache = true;
-	int ret = 0;
-
-	BUG_ON(v->nr_blocks	!= h->s->nr_data + h->s->nr_parity);
-	BUG_ON(v->nr_redundant	!= h->s->nr_parity);
-
-	for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
-		__clear_bit(v->ptrs[i].dev, devs.d);
-		if (i < h->s->nr_data)
-			nr_have_data++;
-		else
-			nr_have_parity++;
-	}
-
-	BUG_ON(nr_have_data	> h->s->nr_data);
-	BUG_ON(nr_have_parity	> h->s->nr_parity);
-
-	buckets.nr = 0;
-	if (nr_have_parity < h->s->nr_parity) {
-		ret = bch2_bucket_alloc_set_trans(trans, &buckets,
-					    &h->parity_stripe,
-					    &devs,
-					    h->s->nr_parity,
-					    &nr_have_parity,
-					    &have_cache, 0,
-					    BCH_DATA_parity,
-					    watermark,
-					    cl);
-
-		open_bucket_for_each(c, &buckets, ob, i) {
-			j = find_next_zero_bit(h->s->blocks_gotten,
-					       h->s->nr_data + h->s->nr_parity,
-					       h->s->nr_data);
-			BUG_ON(j >= h->s->nr_data + h->s->nr_parity);
-
-			h->s->blocks[j] = buckets.v[i];
-			v->ptrs[j] = bch2_ob_ptr(c, ob);
-			__set_bit(j, h->s->blocks_gotten);
-		}
-
-		if (ret)
-			return ret;
-	}
-
-	buckets.nr = 0;
-	if (nr_have_data < h->s->nr_data) {
-		ret = bch2_bucket_alloc_set_trans(trans, &buckets,
-					    &h->block_stripe,
-					    &devs,
-					    h->s->nr_data,
-					    &nr_have_data,
-					    &have_cache, 0,
-					    BCH_DATA_user,
-					    watermark,
-					    cl);
-
-		open_bucket_for_each(c, &buckets, ob, i) {
-			j = find_next_zero_bit(h->s->blocks_gotten,
-					       h->s->nr_data, 0);
-			BUG_ON(j >= h->s->nr_data);
-
-			h->s->blocks[j] = buckets.v[i];
-			v->ptrs[j] = bch2_ob_ptr(c, ob);
-			__set_bit(j, h->s->blocks_gotten);
-		}
-
-		if (ret)
-			return ret;
-	}
-
-	return 0;
-}
-
-/* XXX: doesn't obey target: */
-static s64 get_existing_stripe(struct bch_fs *c,
-			       struct ec_stripe_head *head)
-{
-	ec_stripes_heap *h = &c->ec_stripes_heap;
-	struct stripe *m;
-	size_t heap_idx;
-	u64 stripe_idx;
-	s64 ret = -1;
-
-	if (may_create_new_stripe(c))
-		return -1;
-
-	mutex_lock(&c->ec_stripes_heap_lock);
-	for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
-		/* No blocks worth reusing, stripe will just be deleted: */
-		if (!h->data[heap_idx].blocks_nonempty)
-			continue;
-
-		stripe_idx = h->data[heap_idx].idx;
-
-		m = genradix_ptr(&c->stripes, stripe_idx);
-
-		if (m->algorithm	== head->algo &&
-		    m->nr_redundant	== head->redundancy &&
-		    m->sectors		== head->blocksize &&
-		    m->blocks_nonempty	< m->nr_blocks - m->nr_redundant &&
-		    bch2_try_open_stripe(c, head->s, stripe_idx)) {
-			ret = stripe_idx;
-			break;
-		}
-	}
-	mutex_unlock(&c->ec_stripes_heap_lock);
-	return ret;
-}
-
-static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h)
-{
-	struct bch_fs *c = trans->c;
-	struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
-	struct bch_stripe *existing_v;
-	unsigned i;
-	s64 idx;
-	int ret;
-
-	/*
-	 * If we can't allocate a new stripe, and there's no stripes with empty
-	 * blocks for us to reuse, that means we have to wait on copygc:
-	 */
-	idx = get_existing_stripe(c, h);
-	if (idx < 0)
-		return -BCH_ERR_stripe_alloc_blocked;
-
-	ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe);
-	bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
-			     "reading stripe key: %s", bch2_err_str(ret));
-	if (ret) {
-		bch2_stripe_close(c, h->s);
-		return ret;
-	}
-
-	existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v;
-
-	BUG_ON(existing_v->nr_redundant != h->s->nr_parity);
-	h->s->nr_data = existing_v->nr_blocks -
-		existing_v->nr_redundant;
-
-	ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize);
-	if (ret) {
-		bch2_stripe_close(c, h->s);
-		return ret;
-	}
-
-	BUG_ON(h->s->existing_stripe.size != h->blocksize);
-	BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
-
-	/*
-	 * Free buckets we initially allocated - they might conflict with
-	 * blocks from the stripe we're reusing:
-	 */
-	for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) {
-		bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]);
-		h->s->blocks[i] = 0;
-	}
-	memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten));
-	memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated));
-
-	for (i = 0; i < existing_v->nr_blocks; i++) {
-		if (stripe_blockcount_get(existing_v, i)) {
-			__set_bit(i, h->s->blocks_gotten);
-			__set_bit(i, h->s->blocks_allocated);
-		}
-
-		ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
-	}
-
-	bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key);
-	h->s->have_existing_stripe = true;
-
-	return 0;
-}
-
-static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	struct bpos min_pos = POS(0, 1);
-	struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
-	int ret;
-
-	if (!h->s->res.sectors) {
-		ret = bch2_disk_reservation_get(c, &h->s->res,
-					h->blocksize,
-					h->s->nr_parity,
-					BCH_DISK_RESERVATION_NOFAIL);
-		if (ret)
-			return ret;
-	}
-
-	for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
-			   BTREE_ITER_slots|BTREE_ITER_intent, k, ret) {
-		if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
-			if (start_pos.offset) {
-				start_pos = min_pos;
-				bch2_btree_iter_set_pos(&iter, start_pos);
-				continue;
-			}
-
-			ret = -BCH_ERR_ENOSPC_stripe_create;
-			break;
-		}
-
-		if (bkey_deleted(k.k) &&
-		    bch2_try_open_stripe(c, h->s, k.k->p.offset))
-			break;
-	}
-
-	c->ec_stripe_hint = iter.pos.offset;
-
-	if (ret)
-		goto err;
-
-	ret = ec_stripe_mem_alloc(trans, &iter);
-	if (ret) {
-		bch2_stripe_close(c, h->s);
-		goto err;
-	}
-
-	h->s->new_stripe.key.k.p = iter.pos;
-out:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-err:
-	bch2_disk_reservation_put(c, &h->s->res);
-	goto out;
-}
-
-struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
-					       unsigned target,
-					       unsigned algo,
-					       unsigned redundancy,
-					       enum bch_watermark watermark,
-					       struct closure *cl)
-{
-	struct bch_fs *c = trans->c;
-	struct ec_stripe_head *h;
-	bool waiting = false;
-	int ret;
-
-	h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
-	if (IS_ERR_OR_NULL(h))
-		return h;
-
-	if (!h->s) {
-		ret = ec_new_stripe_alloc(c, h);
-		if (ret) {
-			bch_err(c, "failed to allocate new stripe");
-			goto err;
-		}
-	}
-
-	if (h->s->allocated)
-		goto allocated;
-
-	if (h->s->have_existing_stripe)
-		goto alloc_existing;
-
-	/* First, try to allocate a full stripe: */
-	ret =   new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?:
-		__bch2_ec_stripe_head_reserve(trans, h);
-	if (!ret)
-		goto allocate_buf;
-	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
-	    bch2_err_matches(ret, ENOMEM))
-		goto err;
-
-	/*
-	 * Not enough buckets available for a full stripe: we must reuse an
-	 * existing stripe:
-	 */
-	while (1) {
-		ret = __bch2_ec_stripe_head_reuse(trans, h);
-		if (!ret)
-			break;
-		if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
-			goto err;
-
-		if (watermark == BCH_WATERMARK_copygc) {
-			ret =   new_stripe_alloc_buckets(trans, h, watermark, NULL) ?:
-				__bch2_ec_stripe_head_reserve(trans, h);
-			if (ret)
-				goto err;
-			goto allocate_buf;
-		}
-
-		/* XXX freelist_wait? */
-		closure_wait(&c->freelist_wait, cl);
-		waiting = true;
-	}
-
-	if (waiting)
-		closure_wake_up(&c->freelist_wait);
-alloc_existing:
-	/*
-	 * Retry allocating buckets, with the watermark for this
-	 * particular write:
-	 */
-	ret = new_stripe_alloc_buckets(trans, h, watermark, cl);
-	if (ret)
-		goto err;
-
-allocate_buf:
-	ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize);
-	if (ret)
-		goto err;
-
-	h->s->allocated = true;
-allocated:
-	BUG_ON(!h->s->idx);
-	BUG_ON(!h->s->new_stripe.data[0]);
-	BUG_ON(trans->restarted);
-	return h;
-err:
-	bch2_ec_stripe_head_put(c, h);
-	return ERR_PTR(ret);
-}
-
-static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
-{
-	struct ec_stripe_head *h;
-	struct open_bucket *ob;
-	unsigned i;
-
-	mutex_lock(&c->ec_stripe_head_lock);
-	list_for_each_entry(h, &c->ec_stripe_head_list, list) {
-		mutex_lock(&h->lock);
-		if (!h->s)
-			goto unlock;
-
-		if (!ca)
-			goto found;
-
-		for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) {
-			if (!h->s->blocks[i])
-				continue;
-
-			ob = c->open_buckets + h->s->blocks[i];
-			if (ob->dev == ca->dev_idx)
-				goto found;
-		}
-		goto unlock;
-found:
-		h->s->err = -BCH_ERR_erofs_no_writes;
-		ec_stripe_set_pending(c, h);
-unlock:
-		mutex_unlock(&h->lock);
-	}
-	mutex_unlock(&c->ec_stripe_head_lock);
-}
-
-void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
-{
-	__bch2_ec_stop(c, ca);
-}
-
-void bch2_fs_ec_stop(struct bch_fs *c)
-{
-	__bch2_ec_stop(c, NULL);
-}
-
-static bool bch2_fs_ec_flush_done(struct bch_fs *c)
-{
-	bool ret;
-
-	mutex_lock(&c->ec_stripe_new_lock);
-	ret = list_empty(&c->ec_stripe_new_list);
-	mutex_unlock(&c->ec_stripe_new_lock);
-
-	return ret;
-}
-
-void bch2_fs_ec_flush(struct bch_fs *c)
-{
-	wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
-}
-
-int bch2_stripes_read(struct bch_fs *c)
-{
-	int ret = bch2_trans_run(c,
-		for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
-				   BTREE_ITER_prefetch, k, ({
-			if (k.k->type != KEY_TYPE_stripe)
-				continue;
-
-			ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
-			if (ret)
-				break;
-
-			const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
-
-			struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
-			m->sectors	= le16_to_cpu(s->sectors);
-			m->algorithm	= s->algorithm;
-			m->nr_blocks	= s->nr_blocks;
-			m->nr_redundant	= s->nr_redundant;
-			m->blocks_nonempty = 0;
-
-			for (unsigned i = 0; i < s->nr_blocks; i++)
-				m->blocks_nonempty += !!stripe_blockcount_get(s, i);
-
-			bch2_stripes_heap_insert(c, m, k.k->p.offset);
-			0;
-		})));
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
-{
-	ec_stripes_heap *h = &c->ec_stripes_heap;
-	struct stripe *m;
-	size_t i;
-
-	mutex_lock(&c->ec_stripes_heap_lock);
-	for (i = 0; i < min_t(size_t, h->used, 50); i++) {
-		m = genradix_ptr(&c->stripes, h->data[i].idx);
-
-		prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
-		       h->data[i].blocks_nonempty,
-		       m->nr_blocks - m->nr_redundant,
-		       m->nr_redundant);
-		if (bch2_stripe_is_open(c, h->data[i].idx))
-			prt_str(out, " open");
-		prt_newline(out);
-	}
-	mutex_unlock(&c->ec_stripes_heap_lock);
-}
-
-void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
-{
-	struct ec_stripe_head *h;
-	struct ec_stripe_new *s;
-
-	mutex_lock(&c->ec_stripe_head_lock);
-	list_for_each_entry(h, &c->ec_stripe_head_list, list) {
-		prt_printf(out, "target %u algo %u redundancy %u %s:\n",
-		       h->target, h->algo, h->redundancy,
-		       bch2_watermarks[h->watermark]);
-
-		if (h->s)
-			prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n",
-			       h->s->idx, h->s->nr_data, h->s->nr_parity,
-			       bitmap_weight(h->s->blocks_allocated,
-					     h->s->nr_data));
-	}
-	mutex_unlock(&c->ec_stripe_head_lock);
-
-	prt_printf(out, "in flight:\n");
-
-	mutex_lock(&c->ec_stripe_new_lock);
-	list_for_each_entry(s, &c->ec_stripe_new_list, list) {
-		prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n",
-			   s->idx, s->nr_data, s->nr_parity,
-			   atomic_read(&s->ref[STRIPE_REF_io]),
-			   atomic_read(&s->ref[STRIPE_REF_stripe]),
-			   bch2_watermarks[s->h->watermark]);
-	}
-	mutex_unlock(&c->ec_stripe_new_lock);
-}
-
-void bch2_fs_ec_exit(struct bch_fs *c)
-{
-	struct ec_stripe_head *h;
-	unsigned i;
-
-	while (1) {
-		mutex_lock(&c->ec_stripe_head_lock);
-		h = list_first_entry_or_null(&c->ec_stripe_head_list,
-					     struct ec_stripe_head, list);
-		if (h)
-			list_del(&h->list);
-		mutex_unlock(&c->ec_stripe_head_lock);
-		if (!h)
-			break;
-
-		if (h->s) {
-			for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++)
-				BUG_ON(h->s->blocks[i]);
-
-			kfree(h->s);
-		}
-		kfree(h);
-	}
-
-	BUG_ON(!list_empty(&c->ec_stripe_new_list));
-
-	free_heap(&c->ec_stripes_heap);
-	genradix_free(&c->stripes);
-	bioset_exit(&c->ec_bioset);
-}
-
-void bch2_fs_ec_init_early(struct bch_fs *c)
-{
-	spin_lock_init(&c->ec_stripes_new_lock);
-	mutex_init(&c->ec_stripes_heap_lock);
-
-	INIT_LIST_HEAD(&c->ec_stripe_head_list);
-	mutex_init(&c->ec_stripe_head_lock);
-
-	INIT_LIST_HEAD(&c->ec_stripe_new_list);
-	mutex_init(&c->ec_stripe_new_lock);
-	init_waitqueue_head(&c->ec_stripe_new_wait);
-
-	INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
-	INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
-}
-
-int bch2_fs_ec_init(struct bch_fs *c)
-{
-	return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
-			   BIOSET_NEED_BVECS);
-}
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
deleted file mode 100644
index 84a23eeb6249..000000000000
--- a/fs/bcachefs/ec.h
+++ /dev/null
@@ -1,264 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EC_H
-#define _BCACHEFS_EC_H
-
-#include "ec_types.h"
-#include "buckets_types.h"
-#include "extents_types.h"
-
-enum bch_validate_flags;
-
-int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c,
-			enum bch_validate_flags, struct printbuf *);
-void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
-			 struct bkey_s_c);
-int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned,
-			struct bkey_s_c, struct bkey_s,
-			enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_stripe ((struct bkey_ops) {	\
-	.key_invalid	= bch2_stripe_invalid,		\
-	.val_to_text	= bch2_stripe_to_text,		\
-	.swab		= bch2_ptr_swab,		\
-	.trigger	= bch2_trigger_stripe,		\
-	.min_val_size	= 8,				\
-})
-
-static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
-{
-	return DIV_ROUND_UP(le16_to_cpu(s->sectors),
-			    1 << s->csum_granularity_bits);
-}
-
-static inline unsigned stripe_csum_offset(const struct bch_stripe *s,
-					  unsigned dev, unsigned csum_idx)
-{
-	EBUG_ON(s->csum_type >= BCH_CSUM_NR);
-
-	unsigned csum_bytes = bch_crc_bytes[s->csum_type];
-
-	return sizeof(struct bch_stripe) +
-		sizeof(struct bch_extent_ptr) * s->nr_blocks +
-		(dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes;
-}
-
-static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s,
-						unsigned idx)
-{
-	return stripe_csum_offset(s, s->nr_blocks, 0) +
-		sizeof(u16) * idx;
-}
-
-static inline unsigned stripe_blockcount_get(const struct bch_stripe *s,
-					     unsigned idx)
-{
-	return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx));
-}
-
-static inline void stripe_blockcount_set(struct bch_stripe *s,
-					 unsigned idx, unsigned v)
-{
-	__le16 *p = (void *) s + stripe_blockcount_offset(s, idx);
-
-	*p = cpu_to_le16(v);
-}
-
-static inline unsigned stripe_val_u64s(const struct bch_stripe *s)
-{
-	return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks),
-			    sizeof(u64));
-}
-
-static inline void *stripe_csum(struct bch_stripe *s,
-				unsigned block, unsigned csum_idx)
-{
-	EBUG_ON(block >= s->nr_blocks);
-	EBUG_ON(csum_idx >= stripe_csums_per_device(s));
-
-	return (void *) s + stripe_csum_offset(s, block, csum_idx);
-}
-
-static inline struct bch_csum stripe_csum_get(struct bch_stripe *s,
-				   unsigned block, unsigned csum_idx)
-{
-	struct bch_csum csum = { 0 };
-
-	memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]);
-	return csum;
-}
-
-static inline void stripe_csum_set(struct bch_stripe *s,
-				   unsigned block, unsigned csum_idx,
-				   struct bch_csum csum)
-{
-	memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]);
-}
-
-static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr,
-					     const struct bch_extent_ptr *data_ptr,
-					     unsigned sectors)
-{
-	return  data_ptr->dev    == stripe_ptr->dev &&
-		data_ptr->gen    == stripe_ptr->gen &&
-		data_ptr->offset >= stripe_ptr->offset &&
-		data_ptr->offset  < stripe_ptr->offset + sectors;
-}
-
-static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s,
-					   struct extent_ptr_decoded p)
-{
-	unsigned nr_data = s->nr_blocks - s->nr_redundant;
-
-	BUG_ON(!p.has_ec);
-
-	if (p.ec.block >= nr_data)
-		return false;
-
-	return __bch2_ptr_matches_stripe(&s->ptrs[p.ec.block], &p.ptr,
-					 le16_to_cpu(s->sectors));
-}
-
-static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m,
-					     struct extent_ptr_decoded p)
-{
-	unsigned nr_data = m->nr_blocks - m->nr_redundant;
-
-	BUG_ON(!p.has_ec);
-
-	if (p.ec.block >= nr_data)
-		return false;
-
-	return __bch2_ptr_matches_stripe(&m->ptrs[p.ec.block], &p.ptr,
-					 m->sectors);
-}
-
-struct bch_read_bio;
-
-struct ec_stripe_buf {
-	/* might not be buffering the entire stripe: */
-	unsigned		offset;
-	unsigned		size;
-	unsigned long		valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
-
-	void			*data[BCH_BKEY_PTRS_MAX];
-
-	__BKEY_PADDED(key, 255);
-};
-
-struct ec_stripe_head;
-
-enum ec_stripe_ref {
-	STRIPE_REF_io,
-	STRIPE_REF_stripe,
-	STRIPE_REF_NR
-};
-
-struct ec_stripe_new {
-	struct bch_fs		*c;
-	struct ec_stripe_head	*h;
-	struct mutex		lock;
-	struct list_head	list;
-
-	struct hlist_node	hash;
-	u64			idx;
-
-	struct closure		iodone;
-
-	atomic_t		ref[STRIPE_REF_NR];
-
-	int			err;
-
-	u8			nr_data;
-	u8			nr_parity;
-	bool			allocated;
-	bool			pending;
-	bool			have_existing_stripe;
-
-	unsigned long		blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
-	unsigned long		blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
-	open_bucket_idx_t	blocks[BCH_BKEY_PTRS_MAX];
-	struct disk_reservation	res;
-
-	struct ec_stripe_buf	new_stripe;
-	struct ec_stripe_buf	existing_stripe;
-};
-
-struct ec_stripe_head {
-	struct list_head	list;
-	struct mutex		lock;
-
-	unsigned		target;
-	unsigned		algo;
-	unsigned		redundancy;
-	enum bch_watermark	watermark;
-
-	struct bch_devs_mask	devs;
-	unsigned		nr_active_devs;
-
-	unsigned		blocksize;
-
-	struct dev_stripe_state	block_stripe;
-	struct dev_stripe_state	parity_stripe;
-
-	struct ec_stripe_new	*s;
-};
-
-int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *);
-
-void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
-
-void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
-
-int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
-
-void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *);
-struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
-			unsigned, unsigned, unsigned,
-			enum bch_watermark, struct closure *);
-
-void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t);
-void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t);
-void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t);
-
-void bch2_do_stripe_deletes(struct bch_fs *);
-void bch2_ec_do_stripe_creates(struct bch_fs *);
-void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *);
-
-static inline void ec_stripe_new_get(struct ec_stripe_new *s,
-				     enum ec_stripe_ref ref)
-{
-	atomic_inc(&s->ref[ref]);
-}
-
-static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
-				     enum ec_stripe_ref ref)
-{
-	BUG_ON(atomic_read(&s->ref[ref]) <= 0);
-
-	if (atomic_dec_and_test(&s->ref[ref]))
-		switch (ref) {
-		case STRIPE_REF_stripe:
-			bch2_ec_stripe_new_free(c, s);
-			break;
-		case STRIPE_REF_io:
-			bch2_ec_do_stripe_creates(c);
-			break;
-		default:
-			BUG();
-		}
-}
-
-void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
-void bch2_fs_ec_stop(struct bch_fs *);
-void bch2_fs_ec_flush(struct bch_fs *);
-
-int bch2_stripes_read(struct bch_fs *);
-
-void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *);
-void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_fs_ec_exit(struct bch_fs *);
-void bch2_fs_ec_init_early(struct bch_fs *);
-int bch2_fs_ec_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_EC_H */
diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h
deleted file mode 100644
index 44ce88ba08d7..000000000000
--- a/fs/bcachefs/ec_format.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EC_FORMAT_H
-#define _BCACHEFS_EC_FORMAT_H
-
-struct bch_stripe {
-	struct bch_val		v;
-	__le16			sectors;
-	__u8			algorithm;
-	__u8			nr_blocks;
-	__u8			nr_redundant;
-
-	__u8			csum_granularity_bits;
-	__u8			csum_type;
-	__u8			pad;
-
-	struct bch_extent_ptr	ptrs[];
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_EC_FORMAT_H */
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
deleted file mode 100644
index 976426da3a12..000000000000
--- a/fs/bcachefs/ec_types.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EC_TYPES_H
-#define _BCACHEFS_EC_TYPES_H
-
-#include "bcachefs_format.h"
-
-struct bch_replicas_padded {
-	struct bch_replicas_entry_v1	e;
-	u8				pad[BCH_BKEY_PTRS_MAX];
-};
-
-struct stripe {
-	size_t			heap_idx;
-	u16			sectors;
-	u8			algorithm;
-	u8			nr_blocks;
-	u8			nr_redundant;
-	u8			blocks_nonempty;
-};
-
-struct gc_stripe {
-	u16			sectors;
-
-	u8			nr_blocks;
-	u8			nr_redundant;
-
-	unsigned		alive:1; /* does a corresponding key exist in stripes btree? */
-	u16			block_sectors[BCH_BKEY_PTRS_MAX];
-	struct bch_extent_ptr	ptrs[BCH_BKEY_PTRS_MAX];
-
-	struct bch_replicas_padded r;
-};
-
-struct ec_stripe_heap_entry {
-	size_t			idx;
-	unsigned		blocks_nonempty;
-};
-
-typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap;
-
-#endif /* _BCACHEFS_EC_TYPES_H */
diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c
deleted file mode 100644
index 43557bebd0f8..000000000000
--- a/fs/bcachefs/errcode.c
+++ /dev/null
@@ -1,71 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "errcode.h"
-#include "trace.h"
-
-#include <linux/errname.h>
-
-static const char * const bch2_errcode_strs[] = {
-#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = #err,
-	BCH_ERRCODES()
-#undef x
-	NULL
-};
-
-static unsigned bch2_errcode_parents[] = {
-#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class,
-	BCH_ERRCODES()
-#undef x
-};
-
-const char *bch2_err_str(int err)
-{
-	const char *errstr;
-
-	err = abs(err);
-
-	BUG_ON(err >= BCH_ERR_MAX);
-
-	if (err >= BCH_ERR_START)
-		errstr = bch2_errcode_strs[err - BCH_ERR_START];
-	else if (err)
-		errstr = errname(err);
-	else
-		errstr = "(No error)";
-	return errstr ?: "(Invalid error)";
-}
-
-bool __bch2_err_matches(int err, int class)
-{
-	err	= abs(err);
-	class	= abs(class);
-
-	BUG_ON(err	>= BCH_ERR_MAX);
-	BUG_ON(class	>= BCH_ERR_MAX);
-
-	while (err >= BCH_ERR_START && err != class)
-		err = bch2_errcode_parents[err - BCH_ERR_START];
-
-	return err == class;
-}
-
-int __bch2_err_class(int bch_err)
-{
-	int std_err = -bch_err;
-	BUG_ON((unsigned) std_err >= BCH_ERR_MAX);
-
-	while (std_err >= BCH_ERR_START && bch2_errcode_parents[std_err - BCH_ERR_START])
-		std_err = bch2_errcode_parents[std_err - BCH_ERR_START];
-
-	trace_error_downcast(bch_err, std_err, _RET_IP_);
-
-	return -std_err;
-}
-
-const char *bch2_blk_status_to_str(blk_status_t status)
-{
-	if (status == BLK_STS_REMOVED)
-		return "device removed";
-	return blk_status_to_str(status);
-}
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
deleted file mode 100644
index dbe35b80bc0b..000000000000
--- a/fs/bcachefs/errcode.h
+++ /dev/null
@@ -1,292 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ERRCODE_H
-#define _BCACHEFS_ERRCODE_H
-
-#define BCH_ERRCODES()								\
-	x(ERANGE,			ERANGE_option_too_small)		\
-	x(ERANGE,			ERANGE_option_too_big)			\
-	x(EINVAL,			mount_option)				\
-	x(BCH_ERR_mount_option,		option_name)				\
-	x(BCH_ERR_mount_option,		option_value)				\
-	x(BCH_ERR_mount_option,         option_not_bool)                        \
-	x(ENOMEM,			ENOMEM_stripe_buf)			\
-	x(ENOMEM,			ENOMEM_replicas_table)			\
-	x(ENOMEM,			ENOMEM_cpu_replicas)			\
-	x(ENOMEM,			ENOMEM_replicas_gc)			\
-	x(ENOMEM,			ENOMEM_disk_groups_validate)		\
-	x(ENOMEM,			ENOMEM_disk_groups_to_cpu)		\
-	x(ENOMEM,			ENOMEM_mark_snapshot)			\
-	x(ENOMEM,			ENOMEM_mark_stripe)			\
-	x(ENOMEM,			ENOMEM_mark_stripe_ptr)			\
-	x(ENOMEM,			ENOMEM_btree_key_cache_create)		\
-	x(ENOMEM,			ENOMEM_btree_key_cache_fill)		\
-	x(ENOMEM,			ENOMEM_btree_key_cache_insert)		\
-	x(ENOMEM,			ENOMEM_trans_kmalloc)			\
-	x(ENOMEM,			ENOMEM_trans_log_msg)			\
-	x(ENOMEM,			ENOMEM_do_encrypt)			\
-	x(ENOMEM,			ENOMEM_ec_read_extent)			\
-	x(ENOMEM,			ENOMEM_ec_stripe_mem_alloc)		\
-	x(ENOMEM,			ENOMEM_ec_new_stripe_alloc)		\
-	x(ENOMEM,			ENOMEM_fs_btree_cache_init)		\
-	x(ENOMEM,			ENOMEM_fs_btree_key_cache_init)		\
-	x(ENOMEM,			ENOMEM_fs_counters_init)		\
-	x(ENOMEM,			ENOMEM_fs_btree_write_buffer_init)	\
-	x(ENOMEM,			ENOMEM_io_clock_init)			\
-	x(ENOMEM,			ENOMEM_blacklist_table_init)		\
-	x(ENOMEM,			ENOMEM_sb_realloc_injected)		\
-	x(ENOMEM,			ENOMEM_sb_bio_realloc)			\
-	x(ENOMEM,			ENOMEM_sb_buf_realloc)			\
-	x(ENOMEM,			ENOMEM_sb_journal_validate)		\
-	x(ENOMEM,			ENOMEM_sb_journal_v2_validate)		\
-	x(ENOMEM,			ENOMEM_journal_entry_add)		\
-	x(ENOMEM,			ENOMEM_journal_read_buf_realloc)	\
-	x(ENOMEM,			ENOMEM_btree_interior_update_worker_init)\
-	x(ENOMEM,			ENOMEM_btree_interior_update_pool_init)	\
-	x(ENOMEM,			ENOMEM_bio_read_init)			\
-	x(ENOMEM,			ENOMEM_bio_read_split_init)		\
-	x(ENOMEM,			ENOMEM_bio_write_init)			\
-	x(ENOMEM,			ENOMEM_bio_bounce_pages_init)		\
-	x(ENOMEM,			ENOMEM_writepage_bioset_init)		\
-	x(ENOMEM,			ENOMEM_dio_read_bioset_init)		\
-	x(ENOMEM,			ENOMEM_dio_write_bioset_init)		\
-	x(ENOMEM,			ENOMEM_nocow_flush_bioset_init)		\
-	x(ENOMEM,			ENOMEM_promote_table_init)		\
-	x(ENOMEM,			ENOMEM_compression_bounce_read_init)	\
-	x(ENOMEM,			ENOMEM_compression_bounce_write_init)	\
-	x(ENOMEM,			ENOMEM_compression_workspace_init)	\
-	x(ENOMEM,			ENOMEM_decompression_workspace_init)	\
-	x(ENOMEM,			ENOMEM_bucket_gens)			\
-	x(ENOMEM,			ENOMEM_buckets_nouse)			\
-	x(ENOMEM,			ENOMEM_usage_init)			\
-	x(ENOMEM,			ENOMEM_btree_node_read_all_replicas)	\
-	x(ENOMEM,			ENOMEM_btree_node_reclaim)		\
-	x(ENOMEM,			ENOMEM_btree_node_mem_alloc)		\
-	x(ENOMEM,			ENOMEM_btree_cache_cannibalize_lock)	\
-	x(ENOMEM,			ENOMEM_buckets_waiting_for_journal_init)\
-	x(ENOMEM,			ENOMEM_buckets_waiting_for_journal_set)	\
-	x(ENOMEM,			ENOMEM_set_nr_journal_buckets)		\
-	x(ENOMEM,			ENOMEM_dev_journal_init)		\
-	x(ENOMEM,			ENOMEM_journal_pin_fifo)		\
-	x(ENOMEM,			ENOMEM_journal_buf)			\
-	x(ENOMEM,			ENOMEM_gc_start)			\
-	x(ENOMEM,			ENOMEM_gc_alloc_start)			\
-	x(ENOMEM,			ENOMEM_gc_reflink_start)		\
-	x(ENOMEM,			ENOMEM_gc_gens)				\
-	x(ENOMEM,			ENOMEM_gc_repair_key)			\
-	x(ENOMEM,			ENOMEM_fsck_extent_ends_at)		\
-	x(ENOMEM,			ENOMEM_fsck_add_nlink)			\
-	x(ENOMEM,			ENOMEM_journal_key_insert)		\
-	x(ENOMEM,			ENOMEM_journal_keys_sort)		\
-	x(ENOMEM,			ENOMEM_read_superblock_clean)		\
-	x(ENOMEM,			ENOMEM_fs_alloc)			\
-	x(ENOMEM,			ENOMEM_fs_name_alloc)			\
-	x(ENOMEM,			ENOMEM_fs_other_alloc)			\
-	x(ENOMEM,			ENOMEM_dev_alloc)			\
-	x(ENOMEM,			ENOMEM_disk_accounting)			\
-	x(ENOSPC,			ENOSPC_disk_reservation)		\
-	x(ENOSPC,			ENOSPC_bucket_alloc)			\
-	x(ENOSPC,			ENOSPC_disk_label_add)			\
-	x(ENOSPC,			ENOSPC_stripe_create)			\
-	x(ENOSPC,			ENOSPC_inode_create)			\
-	x(ENOSPC,			ENOSPC_str_hash_create)			\
-	x(ENOSPC,			ENOSPC_snapshot_create)			\
-	x(ENOSPC,			ENOSPC_subvolume_create)		\
-	x(ENOSPC,			ENOSPC_sb)				\
-	x(ENOSPC,			ENOSPC_sb_journal)			\
-	x(ENOSPC,			ENOSPC_sb_journal_seq_blacklist)	\
-	x(ENOSPC,			ENOSPC_sb_quota)			\
-	x(ENOSPC,			ENOSPC_sb_replicas)			\
-	x(ENOSPC,			ENOSPC_sb_members)			\
-	x(ENOSPC,			ENOSPC_sb_members_v2)			\
-	x(ENOSPC,			ENOSPC_sb_crypt)			\
-	x(ENOSPC,			ENOSPC_sb_downgrade)			\
-	x(ENOSPC,			ENOSPC_btree_slot)			\
-	x(ENOSPC,			ENOSPC_snapshot_tree)			\
-	x(ENOENT,			ENOENT_bkey_type_mismatch)		\
-	x(ENOENT,			ENOENT_str_hash_lookup)			\
-	x(ENOENT,			ENOENT_str_hash_set_must_replace)	\
-	x(ENOENT,			ENOENT_inode)				\
-	x(ENOENT,			ENOENT_not_subvol)			\
-	x(ENOENT,			ENOENT_not_directory)			\
-	x(ENOENT,			ENOENT_directory_dead)			\
-	x(ENOENT,			ENOENT_subvolume)			\
-	x(ENOENT,			ENOENT_snapshot_tree)			\
-	x(ENOENT,			ENOENT_dirent_doesnt_match_inode)	\
-	x(ENOENT,			ENOENT_dev_not_found)			\
-	x(ENOENT,			ENOENT_dev_idx_not_found)		\
-	x(ENOTEMPTY,			ENOTEMPTY_dir_not_empty)		\
-	x(ENOTEMPTY,			ENOTEMPTY_subvol_not_empty)		\
-	x(0,				open_buckets_empty)			\
-	x(0,				freelist_empty)				\
-	x(BCH_ERR_freelist_empty,	no_buckets_found)			\
-	x(0,				transaction_restart)			\
-	x(BCH_ERR_transaction_restart,	transaction_restart_fault_inject)	\
-	x(BCH_ERR_transaction_restart,	transaction_restart_relock)		\
-	x(BCH_ERR_transaction_restart,	transaction_restart_relock_path)	\
-	x(BCH_ERR_transaction_restart,	transaction_restart_relock_path_intent)	\
-	x(BCH_ERR_transaction_restart,	transaction_restart_relock_after_fill)	\
-	x(BCH_ERR_transaction_restart,	transaction_restart_too_many_iters)	\
-	x(BCH_ERR_transaction_restart,	transaction_restart_lock_node_reused)	\
-	x(BCH_ERR_transaction_restart,	transaction_restart_fill_relock)	\
-	x(BCH_ERR_transaction_restart,	transaction_restart_fill_mem_alloc_fail)\
-	x(BCH_ERR_transaction_restart,	transaction_restart_mem_realloced)	\
-	x(BCH_ERR_transaction_restart,	transaction_restart_in_traverse_all)	\
-	x(BCH_ERR_transaction_restart,	transaction_restart_would_deadlock)	\
-	x(BCH_ERR_transaction_restart,	transaction_restart_would_deadlock_write)\
-	x(BCH_ERR_transaction_restart,	transaction_restart_deadlock_recursion_limit)\
-	x(BCH_ERR_transaction_restart,	transaction_restart_upgrade)		\
-	x(BCH_ERR_transaction_restart,	transaction_restart_key_cache_upgrade)	\
-	x(BCH_ERR_transaction_restart,	transaction_restart_key_cache_fill)	\
-	x(BCH_ERR_transaction_restart,	transaction_restart_key_cache_raced)	\
-	x(BCH_ERR_transaction_restart,	transaction_restart_key_cache_realloced)\
-	x(BCH_ERR_transaction_restart,	transaction_restart_journal_preres_get)	\
-	x(BCH_ERR_transaction_restart,	transaction_restart_split_race)		\
-	x(BCH_ERR_transaction_restart,	transaction_restart_write_buffer_flush)	\
-	x(BCH_ERR_transaction_restart,	transaction_restart_nested)		\
-	x(0,				no_btree_node)				\
-	x(BCH_ERR_no_btree_node,	no_btree_node_relock)			\
-	x(BCH_ERR_no_btree_node,	no_btree_node_upgrade)			\
-	x(BCH_ERR_no_btree_node,	no_btree_node_drop)			\
-	x(BCH_ERR_no_btree_node,	no_btree_node_lock_root)		\
-	x(BCH_ERR_no_btree_node,	no_btree_node_up)			\
-	x(BCH_ERR_no_btree_node,	no_btree_node_down)			\
-	x(BCH_ERR_no_btree_node,	no_btree_node_init)			\
-	x(BCH_ERR_no_btree_node,	no_btree_node_cached)			\
-	x(BCH_ERR_no_btree_node,	no_btree_node_srcu_reset)		\
-	x(0,				btree_insert_fail)			\
-	x(BCH_ERR_btree_insert_fail,	btree_insert_btree_node_full)		\
-	x(BCH_ERR_btree_insert_fail,	btree_insert_need_mark_replicas)	\
-	x(BCH_ERR_btree_insert_fail,	btree_insert_need_journal_res)		\
-	x(BCH_ERR_btree_insert_fail,	btree_insert_need_journal_reclaim)	\
-	x(0,				backpointer_to_overwritten_btree_node)	\
-	x(0,				lock_fail_root_changed)			\
-	x(0,				journal_reclaim_would_deadlock)		\
-	x(EINVAL,			fsck)					\
-	x(BCH_ERR_fsck,			fsck_fix)				\
-	x(BCH_ERR_fsck,			fsck_ignore)				\
-	x(BCH_ERR_fsck,			fsck_errors_not_fixed)			\
-	x(BCH_ERR_fsck,			fsck_repair_unimplemented)		\
-	x(BCH_ERR_fsck,			fsck_repair_impossible)			\
-	x(0,				restart_recovery)			\
-	x(0,				data_update_done)			\
-	x(EINVAL,			device_state_not_allowed)		\
-	x(EINVAL,			member_info_missing)			\
-	x(EINVAL,			mismatched_block_size)			\
-	x(EINVAL,			block_size_too_small)			\
-	x(EINVAL,			bucket_size_too_small)			\
-	x(EINVAL,			device_size_too_small)			\
-	x(EINVAL,			device_size_too_big)			\
-	x(EINVAL,			device_not_a_member_of_filesystem)	\
-	x(EINVAL,			device_has_been_removed)		\
-	x(EINVAL,			device_splitbrain)			\
-	x(EINVAL,			device_already_online)			\
-	x(EINVAL,			insufficient_devices_to_start)		\
-	x(EINVAL,			invalid)				\
-	x(EINVAL,			internal_fsck_err)			\
-	x(EINVAL,			opt_parse_error)			\
-	x(EINVAL,			remove_with_metadata_missing_unimplemented)\
-	x(EINVAL,			remove_would_lose_data)			\
-	x(EINVAL,			btree_iter_with_journal_not_supported)	\
-	x(EROFS,			erofs_trans_commit)			\
-	x(EROFS,			erofs_no_writes)			\
-	x(EROFS,			erofs_journal_err)			\
-	x(EROFS,			erofs_sb_err)				\
-	x(EROFS,			erofs_unfixed_errors)			\
-	x(EROFS,			erofs_norecovery)			\
-	x(EROFS,			erofs_nochanges)			\
-	x(EROFS,			insufficient_devices)			\
-	x(0,				operation_blocked)			\
-	x(BCH_ERR_operation_blocked,	btree_cache_cannibalize_lock_blocked)	\
-	x(BCH_ERR_operation_blocked,	journal_res_get_blocked)		\
-	x(BCH_ERR_operation_blocked,	journal_preres_get_blocked)		\
-	x(BCH_ERR_operation_blocked,	bucket_alloc_blocked)			\
-	x(BCH_ERR_operation_blocked,	stripe_alloc_blocked)			\
-	x(BCH_ERR_invalid,		invalid_sb)				\
-	x(BCH_ERR_invalid_sb,		invalid_sb_magic)			\
-	x(BCH_ERR_invalid_sb,		invalid_sb_version)			\
-	x(BCH_ERR_invalid_sb,		invalid_sb_features)			\
-	x(BCH_ERR_invalid_sb,		invalid_sb_too_big)			\
-	x(BCH_ERR_invalid_sb,		invalid_sb_csum_type)			\
-	x(BCH_ERR_invalid_sb,		invalid_sb_csum)			\
-	x(BCH_ERR_invalid_sb,		invalid_sb_block_size)			\
-	x(BCH_ERR_invalid_sb,		invalid_sb_uuid)			\
-	x(BCH_ERR_invalid_sb,		invalid_sb_too_many_members)		\
-	x(BCH_ERR_invalid_sb,		invalid_sb_dev_idx)			\
-	x(BCH_ERR_invalid_sb,		invalid_sb_time_precision)		\
-	x(BCH_ERR_invalid_sb,		invalid_sb_field_size)			\
-	x(BCH_ERR_invalid_sb,		invalid_sb_layout)			\
-	x(BCH_ERR_invalid_sb_layout,	invalid_sb_layout_type)			\
-	x(BCH_ERR_invalid_sb_layout,	invalid_sb_layout_nr_superblocks)	\
-	x(BCH_ERR_invalid_sb_layout,	invalid_sb_layout_superblocks_overlap)	\
-	x(BCH_ERR_invalid_sb,		invalid_sb_members_missing)		\
-	x(BCH_ERR_invalid_sb,		invalid_sb_members)			\
-	x(BCH_ERR_invalid_sb,		invalid_sb_disk_groups)			\
-	x(BCH_ERR_invalid_sb,		invalid_sb_replicas)			\
-	x(BCH_ERR_invalid_sb,		invalid_replicas_entry)			\
-	x(BCH_ERR_invalid_sb,		invalid_sb_journal)			\
-	x(BCH_ERR_invalid_sb,		invalid_sb_journal_seq_blacklist)	\
-	x(BCH_ERR_invalid_sb,		invalid_sb_crypt)			\
-	x(BCH_ERR_invalid_sb,		invalid_sb_clean)			\
-	x(BCH_ERR_invalid_sb,		invalid_sb_quota)			\
-	x(BCH_ERR_invalid_sb,		invalid_sb_errors)			\
-	x(BCH_ERR_invalid_sb,		invalid_sb_opt_compression)		\
-	x(BCH_ERR_invalid_sb,		invalid_sb_ext)				\
-	x(BCH_ERR_invalid_sb,		invalid_sb_downgrade)			\
-	x(BCH_ERR_invalid,		invalid_bkey)				\
-	x(BCH_ERR_operation_blocked,    nocow_lock_blocked)			\
-	x(EIO,				btree_node_read_err)			\
-	x(EIO,				sb_not_downgraded)			\
-	x(EIO,				btree_node_write_all_failed)		\
-	x(EIO,				btree_node_read_error)			\
-	x(EIO,				btree_node_read_validate_error)		\
-	x(EIO,				btree_need_topology_repair)		\
-	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_fixable)		\
-	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_want_retry)		\
-	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_must_retry)		\
-	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_bad_node)		\
-	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_incompatible)	\
-	x(0,				nopromote)				\
-	x(BCH_ERR_nopromote,		nopromote_may_not)			\
-	x(BCH_ERR_nopromote,		nopromote_already_promoted)		\
-	x(BCH_ERR_nopromote,		nopromote_unwritten)			\
-	x(BCH_ERR_nopromote,		nopromote_congested)			\
-	x(BCH_ERR_nopromote,		nopromote_in_flight)			\
-	x(BCH_ERR_nopromote,		nopromote_no_writes)			\
-	x(BCH_ERR_nopromote,		nopromote_enomem)			\
-	x(0,				need_inode_lock)			\
-	x(0,				invalid_snapshot_node)
-
-enum bch_errcode {
-	BCH_ERR_START		= 2048,
-#define x(class, err) BCH_ERR_##err,
-	BCH_ERRCODES()
-#undef x
-	BCH_ERR_MAX
-};
-
-const char *bch2_err_str(int);
-bool __bch2_err_matches(int, int);
-
-static inline bool _bch2_err_matches(int err, int class)
-{
-	return err < 0 && __bch2_err_matches(err, class);
-}
-
-#define bch2_err_matches(_err, _class)			\
-({							\
-	BUILD_BUG_ON(!__builtin_constant_p(_class));	\
-	unlikely(_bch2_err_matches(_err, _class));	\
-})
-
-int __bch2_err_class(int);
-
-static inline long bch2_err_class(long err)
-{
-	return err < 0 ? __bch2_err_class(err) : err;
-}
-
-#define BLK_STS_REMOVED		((__force blk_status_t)128)
-
-const char *bch2_blk_status_to_str(blk_status_t);
-
-#endif /* _BCACHFES_ERRCODE_H */
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
deleted file mode 100644
index c66eeffcd7f2..000000000000
--- a/fs/bcachefs/error.c
+++ /dev/null
@@ -1,382 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "error.h"
-#include "journal.h"
-#include "recovery_passes.h"
-#include "super.h"
-#include "thread_with_file.h"
-
-#define FSCK_ERR_RATELIMIT_NR	10
-
-bool bch2_inconsistent_error(struct bch_fs *c)
-{
-	set_bit(BCH_FS_error, &c->flags);
-
-	switch (c->opts.errors) {
-	case BCH_ON_ERROR_continue:
-		return false;
-	case BCH_ON_ERROR_ro:
-		if (bch2_fs_emergency_read_only(c))
-			bch_err(c, "inconsistency detected - emergency read only at journal seq %llu",
-				journal_cur_seq(&c->journal));
-		return true;
-	case BCH_ON_ERROR_panic:
-		panic(bch2_fmt(c, "panic after error"));
-		return true;
-	default:
-		BUG();
-	}
-}
-
-int bch2_topology_error(struct bch_fs *c)
-{
-	set_bit(BCH_FS_topology_error, &c->flags);
-	if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
-		bch2_inconsistent_error(c);
-		return -BCH_ERR_btree_need_topology_repair;
-	} else {
-		return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?:
-			-BCH_ERR_btree_node_read_validate_error;
-	}
-}
-
-void bch2_fatal_error(struct bch_fs *c)
-{
-	if (bch2_fs_emergency_read_only(c))
-		bch_err(c, "fatal error - emergency read only");
-}
-
-void bch2_io_error_work(struct work_struct *work)
-{
-	struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
-	struct bch_fs *c = ca->fs;
-	bool dev;
-
-	down_write(&c->state_lock);
-	dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro,
-				    BCH_FORCE_IF_DEGRADED);
-	if (dev
-	    ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
-				  BCH_FORCE_IF_DEGRADED)
-	    : bch2_fs_emergency_read_only(c))
-		bch_err(ca,
-			"too many IO errors, setting %s RO",
-			dev ? "device" : "filesystem");
-	up_write(&c->state_lock);
-}
-
-void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type)
-{
-	atomic64_inc(&ca->errors[type]);
-	//queue_work(system_long_wq, &ca->io_error_work);
-}
-
-enum ask_yn {
-	YN_NO,
-	YN_YES,
-	YN_ALLNO,
-	YN_ALLYES,
-};
-
-static enum ask_yn parse_yn_response(char *buf)
-{
-	buf = strim(buf);
-
-	if (strlen(buf) == 1)
-		switch (buf[0]) {
-		case 'n':
-			return YN_NO;
-		case 'y':
-			return YN_YES;
-		case 'N':
-			return YN_ALLNO;
-		case 'Y':
-			return YN_ALLYES;
-		}
-	return -1;
-}
-
-#ifdef __KERNEL__
-static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c)
-{
-	struct stdio_redirect *stdio = c->stdio;
-
-	if (c->stdio_filter && c->stdio_filter != current)
-		stdio = NULL;
-
-	if (!stdio)
-		return YN_NO;
-
-	char buf[100];
-	int ret;
-
-	do {
-		bch2_print(c, " (y,n, or Y,N for all errors of this type) ");
-
-		int r = bch2_stdio_redirect_readline(stdio, buf, sizeof(buf) - 1);
-		if (r < 0)
-			return YN_NO;
-		buf[r] = '\0';
-	} while ((ret = parse_yn_response(buf)) < 0);
-
-	return ret;
-}
-#else
-
-#include "tools-util.h"
-
-static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c)
-{
-	char *buf = NULL;
-	size_t buflen = 0;
-	int ret;
-
-	do {
-		fputs(" (y,n, or Y,N for all errors of this type) ", stdout);
-		fflush(stdout);
-
-		if (getline(&buf, &buflen, stdin) < 0)
-			die("error reading from standard input");
-	} while ((ret = parse_yn_response(buf)) < 0);
-
-	free(buf);
-	return ret;
-}
-
-#endif
-
-static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt)
-{
-	struct fsck_err_state *s;
-
-	if (!test_bit(BCH_FS_fsck_running, &c->flags))
-		return NULL;
-
-	list_for_each_entry(s, &c->fsck_error_msgs, list)
-		if (s->fmt == fmt) {
-			/*
-			 * move it to the head of the list: repeated fsck errors
-			 * are common
-			 */
-			list_move(&s->list, &c->fsck_error_msgs);
-			return s;
-		}
-
-	s = kzalloc(sizeof(*s), GFP_NOFS);
-	if (!s) {
-		if (!c->fsck_alloc_msgs_err)
-			bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
-		c->fsck_alloc_msgs_err = true;
-		return NULL;
-	}
-
-	INIT_LIST_HEAD(&s->list);
-	s->fmt = fmt;
-	list_add(&s->list, &c->fsck_error_msgs);
-	return s;
-}
-
-/* s/fix?/fixing/ s/recreate?/recreating/ */
-static void prt_actioning(struct printbuf *out, const char *action)
-{
-	unsigned len = strlen(action);
-
-	BUG_ON(action[len - 1] != '?');
-	--len;
-
-	if (action[len - 1] == 'e')
-		--len;
-
-	prt_bytes(out, action, len);
-	prt_str(out, "ing");
-}
-
-int bch2_fsck_err(struct bch_fs *c,
-		  enum bch_fsck_flags flags,
-		  enum bch_sb_error_id err,
-		  const char *fmt, ...)
-{
-	struct fsck_err_state *s = NULL;
-	va_list args;
-	bool print = true, suppressing = false, inconsistent = false;
-	struct printbuf buf = PRINTBUF, *out = &buf;
-	int ret = -BCH_ERR_fsck_ignore;
-	const char *action_orig = "fix?", *action = action_orig;
-
-	if ((flags & FSCK_CAN_FIX) &&
-	    test_bit(err, c->sb.errors_silent))
-		return -BCH_ERR_fsck_fix;
-
-	bch2_sb_error_count(c, err);
-
-	va_start(args, fmt);
-	prt_vprintf(out, fmt, args);
-	va_end(args);
-
-	/* Custom fix/continue/recreate/etc.? */
-	if (out->buf[out->pos - 1] == '?') {
-		const char *p = strrchr(out->buf, ',');
-		if (p) {
-			out->pos = p - out->buf;
-			action = kstrdup(p + 2, GFP_KERNEL);
-			if (!action) {
-				ret = -ENOMEM;
-				goto err;
-			}
-		}
-	}
-
-	mutex_lock(&c->fsck_error_msgs_lock);
-	s = fsck_err_get(c, fmt);
-	if (s) {
-		/*
-		 * We may be called multiple times for the same error on
-		 * transaction restart - this memoizes instead of asking the user
-		 * multiple times for the same error:
-		 */
-		if (s->last_msg && !strcmp(buf.buf, s->last_msg)) {
-			ret = s->ret;
-			mutex_unlock(&c->fsck_error_msgs_lock);
-			goto err;
-		}
-
-		kfree(s->last_msg);
-		s->last_msg = kstrdup(buf.buf, GFP_KERNEL);
-		if (!s->last_msg) {
-			mutex_unlock(&c->fsck_error_msgs_lock);
-			ret = -ENOMEM;
-			goto err;
-		}
-
-		if (c->opts.ratelimit_errors &&
-		    !(flags & FSCK_NO_RATELIMIT) &&
-		    s->nr >= FSCK_ERR_RATELIMIT_NR) {
-			if (s->nr == FSCK_ERR_RATELIMIT_NR)
-				suppressing = true;
-			else
-				print = false;
-		}
-
-		s->nr++;
-	}
-
-#ifdef BCACHEFS_LOG_PREFIX
-	if (!strncmp(fmt, "bcachefs:", 9))
-		prt_printf(out, bch2_log_msg(c, ""));
-#endif
-
-	if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
-		if (c->opts.errors != BCH_ON_ERROR_continue ||
-		    !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
-			prt_str(out, ", shutting down");
-			inconsistent = true;
-			ret = -BCH_ERR_fsck_errors_not_fixed;
-		} else if (flags & FSCK_CAN_FIX) {
-			prt_str(out, ", ");
-			prt_actioning(out, action);
-			ret = -BCH_ERR_fsck_fix;
-		} else {
-			prt_str(out, ", continuing");
-			ret = -BCH_ERR_fsck_ignore;
-		}
-	} else if (c->opts.fix_errors == FSCK_FIX_exit) {
-		prt_str(out, ", exiting");
-		ret = -BCH_ERR_fsck_errors_not_fixed;
-	} else if (flags & FSCK_CAN_FIX) {
-		int fix = s && s->fix
-			? s->fix
-			: c->opts.fix_errors;
-
-		if (fix == FSCK_FIX_ask) {
-			prt_str(out, ", ");
-			prt_str(out, action);
-
-			if (bch2_fs_stdio_redirect(c))
-				bch2_print(c, "%s", out->buf);
-			else
-				bch2_print_string_as_lines(KERN_ERR, out->buf);
-			print = false;
-
-			int ask = bch2_fsck_ask_yn(c);
-
-			if (ask >= YN_ALLNO && s)
-				s->fix = ask == YN_ALLNO
-					? FSCK_FIX_no
-					: FSCK_FIX_yes;
-
-			ret = ask & 1
-				? -BCH_ERR_fsck_fix
-				: -BCH_ERR_fsck_ignore;
-		} else if (fix == FSCK_FIX_yes ||
-			   (c->opts.nochanges &&
-			    !(flags & FSCK_CAN_IGNORE))) {
-			prt_str(out, ", ");
-			prt_actioning(out, action);
-			ret = -BCH_ERR_fsck_fix;
-		} else {
-			prt_str(out, ", not ");
-			prt_actioning(out, action);
-		}
-	} else if (flags & FSCK_NEED_FSCK) {
-		prt_str(out, " (run fsck to correct)");
-	} else {
-		prt_str(out, " (repair unimplemented)");
-	}
-
-	if (ret == -BCH_ERR_fsck_ignore &&
-	    (c->opts.fix_errors == FSCK_FIX_exit ||
-	     !(flags & FSCK_CAN_IGNORE)))
-		ret = -BCH_ERR_fsck_errors_not_fixed;
-
-	if (print) {
-		if (bch2_fs_stdio_redirect(c))
-			bch2_print(c, "%s\n", out->buf);
-		else
-			bch2_print_string_as_lines(KERN_ERR, out->buf);
-	}
-
-	if (test_bit(BCH_FS_fsck_running, &c->flags) &&
-	    (ret != -BCH_ERR_fsck_fix &&
-	     ret != -BCH_ERR_fsck_ignore))
-		bch_err(c, "Unable to continue, halting");
-	else if (suppressing)
-		bch_err(c, "Ratelimiting new instances of previous error");
-
-	if (s)
-		s->ret = ret;
-
-	mutex_unlock(&c->fsck_error_msgs_lock);
-
-	if (inconsistent)
-		bch2_inconsistent_error(c);
-
-	if (ret == -BCH_ERR_fsck_fix) {
-		set_bit(BCH_FS_errors_fixed, &c->flags);
-	} else {
-		set_bit(BCH_FS_errors_not_fixed, &c->flags);
-		set_bit(BCH_FS_error, &c->flags);
-	}
-err:
-	if (action != action_orig)
-		kfree(action);
-	printbuf_exit(&buf);
-	return ret;
-}
-
-void bch2_flush_fsck_errs(struct bch_fs *c)
-{
-	struct fsck_err_state *s, *n;
-
-	mutex_lock(&c->fsck_error_msgs_lock);
-
-	list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) {
-		if (s->ratelimited && s->last_msg)
-			bch_err(c, "Saw %llu errors like:\n    %s", s->nr, s->last_msg);
-
-		list_del(&s->list);
-		kfree(s->last_msg);
-		kfree(s);
-	}
-
-	mutex_unlock(&c->fsck_error_msgs_lock);
-}
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
deleted file mode 100644
index 36caedf72d89..000000000000
--- a/fs/bcachefs/error.h
+++ /dev/null
@@ -1,248 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ERROR_H
-#define _BCACHEFS_ERROR_H
-
-#include <linux/list.h>
-#include <linux/printk.h>
-#include "sb-errors.h"
-
-struct bch_dev;
-struct bch_fs;
-struct work_struct;
-
-/*
- * XXX: separate out errors that indicate on disk data is inconsistent, and flag
- * superblock as such
- */
-
-/* Error messages: */
-
-/*
- * Inconsistency errors: The on disk data is inconsistent. If these occur during
- * initial recovery, they don't indicate a bug in the running code - we walk all
- * the metadata before modifying anything. If they occur at runtime, they
- * indicate either a bug in the running code or (less likely) data is being
- * silently corrupted under us.
- *
- * XXX: audit all inconsistent errors and make sure they're all recoverable, in
- * BCH_ON_ERROR_CONTINUE mode
- */
-
-bool bch2_inconsistent_error(struct bch_fs *);
-
-int bch2_topology_error(struct bch_fs *);
-
-#define bch2_fs_topology_error(c, ...)					\
-({									\
-	bch_err(c, "btree topology error: " __VA_ARGS__);		\
-	bch2_topology_error(c);						\
-})
-
-#define bch2_fs_inconsistent(c, ...)					\
-({									\
-	bch_err(c, __VA_ARGS__);					\
-	bch2_inconsistent_error(c);					\
-})
-
-#define bch2_fs_inconsistent_on(cond, c, ...)				\
-({									\
-	bool _ret = unlikely(!!(cond));					\
-									\
-	if (_ret)							\
-		bch2_fs_inconsistent(c, __VA_ARGS__);			\
-	_ret;								\
-})
-
-/*
- * Later we might want to mark only the particular device inconsistent, not the
- * entire filesystem:
- */
-
-#define bch2_dev_inconsistent(ca, ...)					\
-do {									\
-	bch_err(ca, __VA_ARGS__);					\
-	bch2_inconsistent_error((ca)->fs);				\
-} while (0)
-
-#define bch2_dev_inconsistent_on(cond, ca, ...)				\
-({									\
-	bool _ret = unlikely(!!(cond));					\
-									\
-	if (_ret)							\
-		bch2_dev_inconsistent(ca, __VA_ARGS__);			\
-	_ret;								\
-})
-
-/*
- * When a transaction update discovers or is causing a fs inconsistency, it's
- * helpful to also dump the pending updates:
- */
-#define bch2_trans_inconsistent(trans, ...)				\
-({									\
-	bch_err(trans->c, __VA_ARGS__);					\
-	bch2_dump_trans_updates(trans);					\
-	bch2_inconsistent_error(trans->c);				\
-})
-
-#define bch2_trans_inconsistent_on(cond, trans, ...)			\
-({									\
-	bool _ret = unlikely(!!(cond));					\
-									\
-	if (_ret)							\
-		bch2_trans_inconsistent(trans, __VA_ARGS__);		\
-	_ret;								\
-})
-
-/*
- * Fsck errors: inconsistency errors we detect at mount time, and should ideally
- * be able to repair:
- */
-
-struct fsck_err_state {
-	struct list_head	list;
-	const char		*fmt;
-	u64			nr;
-	bool			ratelimited;
-	int			ret;
-	int			fix;
-	char			*last_msg;
-};
-
-enum bch_fsck_flags {
-	FSCK_CAN_FIX		= 1 << 0,
-	FSCK_CAN_IGNORE		= 1 << 1,
-	FSCK_NEED_FSCK		= 1 << 2,
-	FSCK_NO_RATELIMIT	= 1 << 3,
-};
-
-#define fsck_err_count(_c, _err)	bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err)
-
-__printf(4, 5) __cold
-int bch2_fsck_err(struct bch_fs *,
-		  enum bch_fsck_flags,
-		  enum bch_sb_error_id,
-		  const char *, ...);
-void bch2_flush_fsck_errs(struct bch_fs *);
-
-#define __fsck_err(c, _flags, _err_type, ...)				\
-({									\
-	int _ret = bch2_fsck_err(c, _flags, BCH_FSCK_ERR_##_err_type,	\
-				 __VA_ARGS__);				\
-									\
-	if (_ret != -BCH_ERR_fsck_fix &&				\
-	    _ret != -BCH_ERR_fsck_ignore) {				\
-		ret = _ret;						\
-		goto fsck_err;						\
-	}								\
-									\
-	_ret == -BCH_ERR_fsck_fix;					\
-})
-
-/* These macros return true if error should be fixed: */
-
-/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
-
-#define __fsck_err_on(cond, c, _flags, _err_type, ...)			\
-	(unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false)
-
-#define need_fsck_err_on(cond, c, _err_type, ...)				\
-	__fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__)
-
-#define need_fsck_err(c, _err_type, ...)				\
-	__fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__)
-
-#define mustfix_fsck_err(c, _err_type, ...)				\
-	__fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
-
-#define mustfix_fsck_err_on(cond, c, _err_type, ...)			\
-	__fsck_err_on(cond, c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
-
-#define fsck_err(c, _err_type, ...)					\
-	__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
-
-#define fsck_err_on(cond, c, _err_type, ...)				\
-	__fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
-
-__printf(4, 0)
-static inline void bch2_bkey_fsck_err(struct bch_fs *c,
-				     struct printbuf *err_msg,
-				     enum bch_sb_error_id err_type,
-				     const char *fmt, ...)
-{
-	va_list args;
-
-	va_start(args, fmt);
-	prt_vprintf(err_msg, fmt, args);
-	va_end(args);
-}
-
-#define bkey_fsck_err(c, _err_msg, _err_type, ...)			\
-do {									\
-	prt_printf(_err_msg, __VA_ARGS__);				\
-	bch2_sb_error_count(c, BCH_FSCK_ERR_##_err_type);		\
-	ret = -BCH_ERR_invalid_bkey;					\
-	goto fsck_err;							\
-} while (0)
-
-#define bkey_fsck_err_on(cond, ...)					\
-do {									\
-	if (unlikely(cond))						\
-		bkey_fsck_err(__VA_ARGS__);				\
-} while (0)
-
-/*
- * Fatal errors: these don't indicate a bug, but we can't continue running in RW
- * mode - pretty much just due to metadata IO errors:
- */
-
-void bch2_fatal_error(struct bch_fs *);
-
-#define bch2_fs_fatal_error(c, _msg, ...)				\
-do {									\
-	bch_err(c, "%s(): fatal error " _msg, __func__, ##__VA_ARGS__);	\
-	bch2_fatal_error(c);						\
-} while (0)
-
-#define bch2_fs_fatal_err_on(cond, c, ...)				\
-({									\
-	bool _ret = unlikely(!!(cond));					\
-									\
-	if (_ret)							\
-		bch2_fs_fatal_error(c, __VA_ARGS__);			\
-	_ret;								\
-})
-
-/*
- * IO errors: either recoverable metadata IO (because we have replicas), or data
- * IO - we need to log it and print out a message, but we don't (necessarily)
- * want to shut down the fs:
- */
-
-void bch2_io_error_work(struct work_struct *);
-
-/* Does the error handling without logging a message */
-void bch2_io_error(struct bch_dev *, enum bch_member_error_type);
-
-#define bch2_dev_io_err_on(cond, ca, _type, ...)			\
-({									\
-	bool _ret = (cond);						\
-									\
-	if (_ret) {							\
-		bch_err_dev_ratelimited(ca, __VA_ARGS__);		\
-		bch2_io_error(ca, _type);				\
-	}								\
-	_ret;								\
-})
-
-#define bch2_dev_inum_io_err_on(cond, ca, _type, ...)			\
-({									\
-	bool _ret = (cond);						\
-									\
-	if (_ret) {							\
-		bch_err_inum_offset_ratelimited(ca, __VA_ARGS__);	\
-		bch2_io_error(ca, _type);				\
-	}								\
-	_ret;								\
-})
-
-#endif /* _BCACHEFS_ERROR_H */
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
deleted file mode 100644
index 5f4fecb358da..000000000000
--- a/fs/bcachefs/extent_update.c
+++ /dev/null
@@ -1,173 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
-#include "debug.h"
-#include "extents.h"
-#include "extent_update.h"
-
-/*
- * This counts the number of iterators to the alloc & ec btrees we'll need
- * inserting/removing this extent:
- */
-static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
-{
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	const union bch_extent_entry *entry;
-	unsigned ret = 0, lru = 0;
-
-	bkey_extent_entry_for_each(ptrs, entry) {
-		switch (__extent_entry_type(entry)) {
-		case BCH_EXTENT_ENTRY_ptr:
-			/* Might also be updating LRU btree */
-			if (entry->ptr.cached)
-				lru++;
-
-			fallthrough;
-		case BCH_EXTENT_ENTRY_stripe_ptr:
-			ret++;
-		}
-	}
-
-	/*
-	 * Updating keys in the alloc btree may also update keys in the
-	 * freespace or discard btrees:
-	 */
-	return lru + ret * 2;
-}
-
-static int count_iters_for_insert(struct btree_trans *trans,
-				  struct bkey_s_c k,
-				  unsigned offset,
-				  struct bpos *end,
-				  unsigned *nr_iters,
-				  unsigned max_iters)
-{
-	int ret = 0, ret2 = 0;
-
-	if (*nr_iters >= max_iters) {
-		*end = bpos_min(*end, k.k->p);
-		ret = 1;
-	}
-
-	switch (k.k->type) {
-	case KEY_TYPE_extent:
-	case KEY_TYPE_reflink_v:
-		*nr_iters += bch2_bkey_nr_alloc_ptrs(k);
-
-		if (*nr_iters >= max_iters) {
-			*end = bpos_min(*end, k.k->p);
-			ret = 1;
-		}
-
-		break;
-	case KEY_TYPE_reflink_p: {
-		struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
-		u64 idx = le64_to_cpu(p.v->idx);
-		unsigned sectors = bpos_min(*end, p.k->p).offset -
-			bkey_start_offset(p.k);
-		struct btree_iter iter;
-		struct bkey_s_c r_k;
-
-		for_each_btree_key_norestart(trans, iter,
-				   BTREE_ID_reflink, POS(0, idx + offset),
-				   BTREE_ITER_slots, r_k, ret2) {
-			if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors)))
-				break;
-
-			/* extent_update_to_keys(), for the reflink_v update */
-			*nr_iters += 1;
-
-			*nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k);
-
-			if (*nr_iters >= max_iters) {
-				struct bpos pos = bkey_start_pos(k.k);
-				pos.offset += min_t(u64, k.k->size,
-						    r_k.k->p.offset - idx);
-
-				*end = bpos_min(*end, pos);
-				ret = 1;
-				break;
-			}
-		}
-		bch2_trans_iter_exit(trans, &iter);
-
-		break;
-	}
-	}
-
-	return ret2 ?: ret;
-}
-
-#define EXTENT_ITERS_MAX	(BTREE_ITER_INITIAL / 3)
-
-int bch2_extent_atomic_end(struct btree_trans *trans,
-			   struct btree_iter *iter,
-			   struct bkey_i *insert,
-			   struct bpos *end)
-{
-	struct btree_iter copy;
-	struct bkey_s_c k;
-	unsigned nr_iters = 0;
-	int ret;
-
-	ret = bch2_btree_iter_traverse(iter);
-	if (ret)
-		return ret;
-
-	*end = insert->k.p;
-
-	/* extent_update_to_keys(): */
-	nr_iters += 1;
-
-	ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end,
-				     &nr_iters, EXTENT_ITERS_MAX / 2);
-	if (ret < 0)
-		return ret;
-
-	bch2_trans_copy_iter(&copy, iter);
-
-	for_each_btree_key_upto_continue_norestart(copy, insert->k.p, 0, k, ret) {
-		unsigned offset = 0;
-
-		if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k)))
-			offset = bkey_start_offset(&insert->k) -
-				bkey_start_offset(k.k);
-
-		/* extent_handle_overwrites(): */
-		switch (bch2_extent_overlap(&insert->k, k.k)) {
-		case BCH_EXTENT_OVERLAP_ALL:
-		case BCH_EXTENT_OVERLAP_FRONT:
-			nr_iters += 1;
-			break;
-		case BCH_EXTENT_OVERLAP_BACK:
-		case BCH_EXTENT_OVERLAP_MIDDLE:
-			nr_iters += 2;
-			break;
-		}
-
-		ret = count_iters_for_insert(trans, k, offset, end,
-					&nr_iters, EXTENT_ITERS_MAX);
-		if (ret)
-			break;
-	}
-
-	bch2_trans_iter_exit(trans, &copy);
-	return ret < 0 ? ret : 0;
-}
-
-int bch2_extent_trim_atomic(struct btree_trans *trans,
-			    struct btree_iter *iter,
-			    struct bkey_i *k)
-{
-	struct bpos end;
-	int ret;
-
-	ret = bch2_extent_atomic_end(trans, iter, k, &end);
-	if (ret)
-		return ret;
-
-	bch2_cut_back(end, k);
-	return 0;
-}
diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h
deleted file mode 100644
index 6f5cf449361a..000000000000
--- a/fs/bcachefs/extent_update.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EXTENT_UPDATE_H
-#define _BCACHEFS_EXTENT_UPDATE_H
-
-#include "bcachefs.h"
-
-int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *,
-			   struct bkey_i *, struct bpos *);
-int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *,
-			    struct bkey_i *);
-
-#endif /* _BCACHEFS_EXTENT_UPDATE_H */
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
deleted file mode 100644
index 469037929685..000000000000
--- a/fs/bcachefs/extents.c
+++ /dev/null
@@ -1,1543 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
- *
- * Code for managing the extent btree and dynamically updating the writeback
- * dirty sector count.
- */
-
-#include "bcachefs.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-#include "btree_gc.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "compress.h"
-#include "debug.h"
-#include "disk_groups.h"
-#include "error.h"
-#include "extents.h"
-#include "inode.h"
-#include "journal.h"
-#include "replicas.h"
-#include "super.h"
-#include "super-io.h"
-#include "trace.h"
-#include "util.h"
-
-static unsigned bch2_crc_field_size_max[] = {
-	[BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
-	[BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
-	[BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX,
-};
-
-static void bch2_extent_crc_pack(union bch_extent_crc *,
-				 struct bch_extent_crc_unpacked,
-				 enum bch_extent_entry_type);
-
-static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f,
-						   unsigned dev)
-{
-	struct bch_dev_io_failures *i;
-
-	for (i = f->devs; i < f->devs + f->nr; i++)
-		if (i->dev == dev)
-			return i;
-
-	return NULL;
-}
-
-void bch2_mark_io_failure(struct bch_io_failures *failed,
-			  struct extent_ptr_decoded *p)
-{
-	struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev);
-
-	if (!f) {
-		BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
-
-		f = &failed->devs[failed->nr++];
-		f->dev		= p->ptr.dev;
-		f->idx		= p->idx;
-		f->nr_failed	= 1;
-		f->nr_retries	= 0;
-	} else if (p->idx != f->idx) {
-		f->idx		= p->idx;
-		f->nr_failed	= 1;
-		f->nr_retries	= 0;
-	} else {
-		f->nr_failed++;
-	}
-}
-
-static inline u64 dev_latency(struct bch_fs *c, unsigned dev)
-{
-	struct bch_dev *ca = bch2_dev_rcu(c, dev);
-	return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
-}
-
-/*
- * returns true if p1 is better than p2:
- */
-static inline bool ptr_better(struct bch_fs *c,
-			      const struct extent_ptr_decoded p1,
-			      const struct extent_ptr_decoded p2)
-{
-	if (likely(!p1.idx && !p2.idx)) {
-		u64 l1 = dev_latency(c, p1.ptr.dev);
-		u64 l2 = dev_latency(c, p2.ptr.dev);
-
-		/* Pick at random, biased in favor of the faster device: */
-
-		return bch2_rand_range(l1 + l2) > l1;
-	}
-
-	if (bch2_force_reconstruct_read)
-		return p1.idx > p2.idx;
-
-	return p1.idx < p2.idx;
-}
-
-/*
- * This picks a non-stale pointer, preferably from a device other than @avoid.
- * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to
- * other devices, it will still pick a pointer from avoid.
- */
-int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
-			       struct bch_io_failures *failed,
-			       struct extent_ptr_decoded *pick)
-{
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	const union bch_extent_entry *entry;
-	struct extent_ptr_decoded p;
-	struct bch_dev_io_failures *f;
-	int ret = 0;
-
-	if (k.k->type == KEY_TYPE_error)
-		return -EIO;
-
-	rcu_read_lock();
-	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-		/*
-		 * Unwritten extent: no need to actually read, treat it as a
-		 * hole and return 0s:
-		 */
-		if (p.ptr.unwritten) {
-			ret = 0;
-			break;
-		}
-
-		/*
-		 * If there are any dirty pointers it's an error if we can't
-		 * read:
-		 */
-		if (!ret && !p.ptr.cached)
-			ret = -EIO;
-
-		struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
-
-		if (p.ptr.cached && (!ca || dev_ptr_stale(ca, &p.ptr)))
-			continue;
-
-		f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL;
-		if (f)
-			p.idx = f->nr_failed < f->nr_retries
-				? f->idx
-				: f->idx + 1;
-
-		if (!p.idx && !ca)
-			p.idx++;
-
-		if (!p.idx && p.has_ec && bch2_force_reconstruct_read)
-			p.idx++;
-
-		if (!p.idx && !bch2_dev_is_readable(ca))
-			p.idx++;
-
-		if (p.idx >= (unsigned) p.has_ec + 1)
-			continue;
-
-		if (ret > 0 && !ptr_better(c, p, *pick))
-			continue;
-
-		*pick = p;
-		ret = 1;
-	}
-	rcu_read_unlock();
-
-	return ret;
-}
-
-/* KEY_TYPE_btree_ptr: */
-
-int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k,
-			   enum bch_validate_flags flags,
-			   struct printbuf *err)
-{
-	int ret = 0;
-
-	bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, c, err,
-			 btree_ptr_val_too_big,
-			 "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
-
-	ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
-fsck_err:
-	return ret;
-}
-
-void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
-			    struct bkey_s_c k)
-{
-	bch2_bkey_ptrs_to_text(out, c, k);
-}
-
-int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
-			      enum bch_validate_flags flags,
-			      struct printbuf *err)
-{
-	struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
-	int ret = 0;
-
-	bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX,
-			 c, err, btree_ptr_v2_val_too_big,
-			 "value too big (%zu > %zu)",
-			 bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
-
-	bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p),
-			 c, err, btree_ptr_v2_min_key_bad,
-			 "min_key > key");
-
-	if (flags & BCH_VALIDATE_write)
-		bkey_fsck_err_on(!bp.v->sectors_written,
-				 c, err, btree_ptr_v2_written_0,
-				 "sectors_written == 0");
-
-	ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
-fsck_err:
-	return ret;
-}
-
-void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
-			       struct bkey_s_c k)
-{
-	struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
-
-	prt_printf(out, "seq %llx written %u min_key %s",
-	       le64_to_cpu(bp.v->seq),
-	       le16_to_cpu(bp.v->sectors_written),
-	       BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : "");
-
-	bch2_bpos_to_text(out, bp.v->min_key);
-	prt_printf(out, " ");
-	bch2_bkey_ptrs_to_text(out, c, k);
-}
-
-void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
-			      unsigned big_endian, int write,
-			      struct bkey_s k)
-{
-	struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k);
-
-	compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key);
-
-	if (version < bcachefs_metadata_version_inode_btree_change &&
-	    btree_id_is_extents(btree_id) &&
-	    !bkey_eq(bp.v->min_key, POS_MIN))
-		bp.v->min_key = write
-			? bpos_nosnap_predecessor(bp.v->min_key)
-			: bpos_nosnap_successor(bp.v->min_key);
-}
-
-/* KEY_TYPE_extent: */
-
-bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
-{
-	struct bkey_ptrs   l_ptrs = bch2_bkey_ptrs(l);
-	struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r);
-	union bch_extent_entry *en_l;
-	const union bch_extent_entry *en_r;
-	struct extent_ptr_decoded lp, rp;
-	bool use_right_ptr;
-
-	en_l = l_ptrs.start;
-	en_r = r_ptrs.start;
-	while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
-		if (extent_entry_type(en_l) != extent_entry_type(en_r))
-			return false;
-
-		en_l = extent_entry_next(en_l);
-		en_r = extent_entry_next(en_r);
-	}
-
-	if (en_l < l_ptrs.end || en_r < r_ptrs.end)
-		return false;
-
-	en_l = l_ptrs.start;
-	en_r = r_ptrs.start;
-	lp.crc = bch2_extent_crc_unpack(l.k, NULL);
-	rp.crc = bch2_extent_crc_unpack(r.k, NULL);
-
-	while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) &&
-	       __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) {
-		if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size !=
-		    rp.ptr.offset + rp.crc.offset ||
-		    lp.ptr.dev			!= rp.ptr.dev ||
-		    lp.ptr.gen			!= rp.ptr.gen ||
-		    lp.ptr.unwritten		!= rp.ptr.unwritten ||
-		    lp.has_ec			!= rp.has_ec)
-			return false;
-
-		/* Extents may not straddle buckets: */
-		rcu_read_lock();
-		struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev);
-		bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr);
-		rcu_read_unlock();
-
-		if (!same_bucket)
-			return false;
-
-		if (lp.has_ec			!= rp.has_ec ||
-		    (lp.has_ec &&
-		     (lp.ec.block		!= rp.ec.block ||
-		      lp.ec.redundancy		!= rp.ec.redundancy ||
-		      lp.ec.idx			!= rp.ec.idx)))
-			return false;
-
-		if (lp.crc.compression_type	!= rp.crc.compression_type ||
-		    lp.crc.nonce		!= rp.crc.nonce)
-			return false;
-
-		if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <=
-		    lp.crc.uncompressed_size) {
-			/* can use left extent's crc entry */
-		} else if (lp.crc.live_size <= rp.crc.offset) {
-			/* can use right extent's crc entry */
-		} else {
-			/* check if checksums can be merged: */
-			if (lp.crc.csum_type		!= rp.crc.csum_type ||
-			    lp.crc.nonce		!= rp.crc.nonce ||
-			    crc_is_compressed(lp.crc) ||
-			    !bch2_checksum_mergeable(lp.crc.csum_type))
-				return false;
-
-			if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size ||
-			    rp.crc.offset)
-				return false;
-
-			if (lp.crc.csum_type &&
-			    lp.crc.uncompressed_size +
-			    rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9))
-				return false;
-		}
-
-		en_l = extent_entry_next(en_l);
-		en_r = extent_entry_next(en_r);
-	}
-
-	en_l = l_ptrs.start;
-	en_r = r_ptrs.start;
-	while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
-		if (extent_entry_is_crc(en_l)) {
-			struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
-			struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
-
-			if (crc_l.uncompressed_size + crc_r.uncompressed_size >
-			    bch2_crc_field_size_max[extent_entry_type(en_l)])
-				return false;
-		}
-
-		en_l = extent_entry_next(en_l);
-		en_r = extent_entry_next(en_r);
-	}
-
-	use_right_ptr = false;
-	en_l = l_ptrs.start;
-	en_r = r_ptrs.start;
-	while (en_l < l_ptrs.end) {
-		if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr &&
-		    use_right_ptr)
-			en_l->ptr = en_r->ptr;
-
-		if (extent_entry_is_crc(en_l)) {
-			struct bch_extent_crc_unpacked crc_l =
-				bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
-			struct bch_extent_crc_unpacked crc_r =
-				bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
-
-			use_right_ptr = false;
-
-			if (crc_l.offset + crc_l.live_size + crc_r.live_size <=
-			    crc_l.uncompressed_size) {
-				/* can use left extent's crc entry */
-			} else if (crc_l.live_size <= crc_r.offset) {
-				/* can use right extent's crc entry */
-				crc_r.offset -= crc_l.live_size;
-				bch2_extent_crc_pack(entry_to_crc(en_l), crc_r,
-						     extent_entry_type(en_l));
-				use_right_ptr = true;
-			} else {
-				crc_l.csum = bch2_checksum_merge(crc_l.csum_type,
-								 crc_l.csum,
-								 crc_r.csum,
-								 crc_r.uncompressed_size << 9);
-
-				crc_l.uncompressed_size	+= crc_r.uncompressed_size;
-				crc_l.compressed_size	+= crc_r.compressed_size;
-				bch2_extent_crc_pack(entry_to_crc(en_l), crc_l,
-						     extent_entry_type(en_l));
-			}
-		}
-
-		en_l = extent_entry_next(en_l);
-		en_r = extent_entry_next(en_r);
-	}
-
-	bch2_key_resize(l.k, l.k->size + r.k->size);
-	return true;
-}
-
-/* KEY_TYPE_reservation: */
-
-int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k,
-			     enum bch_validate_flags flags,
-			     struct printbuf *err)
-{
-	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
-	int ret = 0;
-
-	bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, c, err,
-			 reservation_key_nr_replicas_invalid,
-			 "invalid nr_replicas (%u)", r.v->nr_replicas);
-fsck_err:
-	return ret;
-}
-
-void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
-			      struct bkey_s_c k)
-{
-	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
-
-	prt_printf(out, "generation %u replicas %u",
-	       le32_to_cpu(r.v->generation),
-	       r.v->nr_replicas);
-}
-
-bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
-{
-	struct bkey_s_reservation l = bkey_s_to_reservation(_l);
-	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r);
-
-	if (l.v->generation != r.v->generation ||
-	    l.v->nr_replicas != r.v->nr_replicas)
-		return false;
-
-	bch2_key_resize(l.k, l.k->size + r.k->size);
-	return true;
-}
-
-/* Extent checksum entries: */
-
-/* returns true if not equal */
-static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
-					 struct bch_extent_crc_unpacked r)
-{
-	return (l.csum_type		!= r.csum_type ||
-		l.compression_type	!= r.compression_type ||
-		l.compressed_size	!= r.compressed_size ||
-		l.uncompressed_size	!= r.uncompressed_size ||
-		l.offset		!= r.offset ||
-		l.live_size		!= r.live_size ||
-		l.nonce			!= r.nonce ||
-		bch2_crc_cmp(l.csum, r.csum));
-}
-
-static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
-				  struct bch_extent_crc_unpacked n)
-{
-	return !crc_is_compressed(u) &&
-		u.csum_type &&
-		u.uncompressed_size > u.live_size &&
-		bch2_csum_type_is_encryption(u.csum_type) ==
-		bch2_csum_type_is_encryption(n.csum_type);
-}
-
-bool bch2_can_narrow_extent_crcs(struct bkey_s_c k,
-				 struct bch_extent_crc_unpacked n)
-{
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	struct bch_extent_crc_unpacked crc;
-	const union bch_extent_entry *i;
-
-	if (!n.csum_type)
-		return false;
-
-	bkey_for_each_crc(k.k, ptrs, crc, i)
-		if (can_narrow_crc(crc, n))
-			return true;
-
-	return false;
-}
-
-/*
- * We're writing another replica for this extent, so while we've got the data in
- * memory we'll be computing a new checksum for the currently live data.
- *
- * If there are other replicas we aren't moving, and they are checksummed but
- * not compressed, we can modify them to point to only the data that is
- * currently live (so that readers won't have to bounce) while we've got the
- * checksum we need:
- */
-bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
-{
-	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
-	struct bch_extent_crc_unpacked u;
-	struct extent_ptr_decoded p;
-	union bch_extent_entry *i;
-	bool ret = false;
-
-	/* Find a checksum entry that covers only live data: */
-	if (!n.csum_type) {
-		bkey_for_each_crc(&k->k, ptrs, u, i)
-			if (!crc_is_compressed(u) &&
-			    u.csum_type &&
-			    u.live_size == u.uncompressed_size) {
-				n = u;
-				goto found;
-			}
-		return false;
-	}
-found:
-	BUG_ON(crc_is_compressed(n));
-	BUG_ON(n.offset);
-	BUG_ON(n.live_size != k->k.size);
-
-restart_narrow_pointers:
-	ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
-
-	bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
-		if (can_narrow_crc(p.crc, n)) {
-			bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr);
-			p.ptr.offset += p.crc.offset;
-			p.crc = n;
-			bch2_extent_ptr_decoded_append(k, &p);
-			ret = true;
-			goto restart_narrow_pointers;
-		}
-
-	return ret;
-}
-
-static void bch2_extent_crc_pack(union bch_extent_crc *dst,
-				 struct bch_extent_crc_unpacked src,
-				 enum bch_extent_entry_type type)
-{
-#define set_common_fields(_dst, _src)					\
-		_dst.type		= 1 << type;			\
-		_dst.csum_type		= _src.csum_type,		\
-		_dst.compression_type	= _src.compression_type,	\
-		_dst._compressed_size	= _src.compressed_size - 1,	\
-		_dst._uncompressed_size	= _src.uncompressed_size - 1,	\
-		_dst.offset		= _src.offset
-
-	switch (type) {
-	case BCH_EXTENT_ENTRY_crc32:
-		set_common_fields(dst->crc32, src);
-		dst->crc32.csum		= (u32 __force) *((__le32 *) &src.csum.lo);
-		break;
-	case BCH_EXTENT_ENTRY_crc64:
-		set_common_fields(dst->crc64, src);
-		dst->crc64.nonce	= src.nonce;
-		dst->crc64.csum_lo	= (u64 __force) src.csum.lo;
-		dst->crc64.csum_hi	= (u64 __force) *((__le16 *) &src.csum.hi);
-		break;
-	case BCH_EXTENT_ENTRY_crc128:
-		set_common_fields(dst->crc128, src);
-		dst->crc128.nonce	= src.nonce;
-		dst->crc128.csum	= src.csum;
-		break;
-	default:
-		BUG();
-	}
-#undef set_common_fields
-}
-
-void bch2_extent_crc_append(struct bkey_i *k,
-			    struct bch_extent_crc_unpacked new)
-{
-	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
-	union bch_extent_crc *crc = (void *) ptrs.end;
-	enum bch_extent_entry_type type;
-
-	if (bch_crc_bytes[new.csum_type]	<= 4 &&
-	    new.uncompressed_size		<= CRC32_SIZE_MAX &&
-	    new.nonce				<= CRC32_NONCE_MAX)
-		type = BCH_EXTENT_ENTRY_crc32;
-	else if (bch_crc_bytes[new.csum_type]	<= 10 &&
-		   new.uncompressed_size	<= CRC64_SIZE_MAX &&
-		   new.nonce			<= CRC64_NONCE_MAX)
-		type = BCH_EXTENT_ENTRY_crc64;
-	else if (bch_crc_bytes[new.csum_type]	<= 16 &&
-		   new.uncompressed_size	<= CRC128_SIZE_MAX &&
-		   new.nonce			<= CRC128_NONCE_MAX)
-		type = BCH_EXTENT_ENTRY_crc128;
-	else
-		BUG();
-
-	bch2_extent_crc_pack(crc, new, type);
-
-	k->k.u64s += extent_entry_u64s(ptrs.end);
-
-	EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX);
-}
-
-/* Generic code for keys with pointers: */
-
-unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k)
-{
-	return bch2_bkey_devs(k).nr;
-}
-
-unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
-{
-	return k.k->type == KEY_TYPE_reservation
-		? bkey_s_c_to_reservation(k).v->nr_replicas
-		: bch2_bkey_dirty_devs(k).nr;
-}
-
-unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k)
-{
-	unsigned ret = 0;
-
-	if (k.k->type == KEY_TYPE_reservation) {
-		ret = bkey_s_c_to_reservation(k).v->nr_replicas;
-	} else {
-		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-		const union bch_extent_entry *entry;
-		struct extent_ptr_decoded p;
-
-		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-			ret += !p.ptr.cached && !crc_is_compressed(p.crc);
-	}
-
-	return ret;
-}
-
-unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k)
-{
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	const union bch_extent_entry *entry;
-	struct extent_ptr_decoded p;
-	unsigned ret = 0;
-
-	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-		if (!p.ptr.cached && crc_is_compressed(p.crc))
-			ret += p.crc.compressed_size;
-
-	return ret;
-}
-
-bool bch2_bkey_is_incompressible(struct bkey_s_c k)
-{
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	const union bch_extent_entry *entry;
-	struct bch_extent_crc_unpacked crc;
-
-	bkey_for_each_crc(k.k, ptrs, crc, entry)
-		if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
-			return true;
-	return false;
-}
-
-unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
-{
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	const union bch_extent_entry *entry;
-	struct extent_ptr_decoded p = { 0 };
-	unsigned replicas = 0;
-
-	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-		if (p.ptr.cached)
-			continue;
-
-		if (p.has_ec)
-			replicas += p.ec.redundancy;
-
-		replicas++;
-
-	}
-
-	return replicas;
-}
-
-static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p)
-{
-	if (p->ptr.cached)
-		return 0;
-
-	return p->has_ec
-		? p->ec.redundancy + 1
-		: ca->mi.durability;
-}
-
-unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
-{
-	struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
-
-	return ca ? __extent_ptr_durability(ca, p) : 0;
-}
-
-unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
-{
-	struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
-
-	if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed)
-		return 0;
-
-	return __extent_ptr_durability(ca, p);
-}
-
-unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
-{
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	const union bch_extent_entry *entry;
-	struct extent_ptr_decoded p;
-	unsigned durability = 0;
-
-	rcu_read_lock();
-	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-		durability += bch2_extent_ptr_durability(c, &p);
-	rcu_read_unlock();
-
-	return durability;
-}
-
-static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
-{
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	const union bch_extent_entry *entry;
-	struct extent_ptr_decoded p;
-	unsigned durability = 0;
-
-	rcu_read_lock();
-	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-		if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])
-			durability += bch2_extent_ptr_durability(c, &p);
-	rcu_read_unlock();
-
-	return durability;
-}
-
-void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
-{
-	union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
-	union bch_extent_entry *next = extent_entry_next(entry);
-
-	memmove_u64s(entry, next, (u64 *) end - (u64 *) next);
-	k->k.u64s -= extent_entry_u64s(entry);
-}
-
-void bch2_extent_ptr_decoded_append(struct bkey_i *k,
-				    struct extent_ptr_decoded *p)
-{
-	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
-	struct bch_extent_crc_unpacked crc =
-		bch2_extent_crc_unpack(&k->k, NULL);
-	union bch_extent_entry *pos;
-
-	if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
-		pos = ptrs.start;
-		goto found;
-	}
-
-	bkey_for_each_crc(&k->k, ptrs, crc, pos)
-		if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
-			pos = extent_entry_next(pos);
-			goto found;
-		}
-
-	bch2_extent_crc_append(k, p->crc);
-	pos = bkey_val_end(bkey_i_to_s(k));
-found:
-	p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
-	__extent_entry_insert(k, pos, to_entry(&p->ptr));
-
-	if (p->has_ec) {
-		p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
-		__extent_entry_insert(k, pos, to_entry(&p->ec));
-	}
-}
-
-static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
-					  union bch_extent_entry *entry)
-{
-	union bch_extent_entry *i = ptrs.start;
-
-	if (i == entry)
-		return NULL;
-
-	while (extent_entry_next(i) != entry)
-		i = extent_entry_next(i);
-	return i;
-}
-
-/*
- * Returns pointer to the next entry after the one being dropped:
- */
-union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k,
-						   struct bch_extent_ptr *ptr)
-{
-	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
-	union bch_extent_entry *entry = to_entry(ptr), *next;
-	union bch_extent_entry *ret = entry;
-	bool drop_crc = true;
-
-	EBUG_ON(ptr < &ptrs.start->ptr ||
-		ptr >= &ptrs.end->ptr);
-	EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
-
-	for (next = extent_entry_next(entry);
-	     next != ptrs.end;
-	     next = extent_entry_next(next)) {
-		if (extent_entry_is_crc(next)) {
-			break;
-		} else if (extent_entry_is_ptr(next)) {
-			drop_crc = false;
-			break;
-		}
-	}
-
-	extent_entry_drop(k, entry);
-
-	while ((entry = extent_entry_prev(ptrs, entry))) {
-		if (extent_entry_is_ptr(entry))
-			break;
-
-		if ((extent_entry_is_crc(entry) && drop_crc) ||
-		    extent_entry_is_stripe_ptr(entry)) {
-			ret = (void *) ret - extent_entry_bytes(entry);
-			extent_entry_drop(k, entry);
-		}
-	}
-
-	return ret;
-}
-
-union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
-					   struct bch_extent_ptr *ptr)
-{
-	bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
-	union bch_extent_entry *ret =
-		bch2_bkey_drop_ptr_noerror(k, ptr);
-
-	/*
-	 * If we deleted all the dirty pointers and there's still cached
-	 * pointers, we could set the cached pointers to dirty if they're not
-	 * stale - but to do that correctly we'd need to grab an open_bucket
-	 * reference so that we don't race with bucket reuse:
-	 */
-	if (have_dirty &&
-	    !bch2_bkey_dirty_devs(k.s_c).nr) {
-		k.k->type = KEY_TYPE_error;
-		set_bkey_val_u64s(k.k, 0);
-		ret = NULL;
-	} else if (!bch2_bkey_nr_ptrs(k.s_c)) {
-		k.k->type = KEY_TYPE_deleted;
-		set_bkey_val_u64s(k.k, 0);
-		ret = NULL;
-	}
-
-	return ret;
-}
-
-void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
-{
-	bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
-}
-
-void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
-{
-	struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev);
-
-	if (ptr)
-		bch2_bkey_drop_ptr_noerror(k, ptr);
-}
-
-const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
-{
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
-	bkey_for_each_ptr(ptrs, ptr)
-		if (ptr->dev == dev)
-			return ptr;
-
-	return NULL;
-}
-
-bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
-{
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	struct bch_dev *ca;
-	bool ret = false;
-
-	rcu_read_lock();
-	bkey_for_each_ptr(ptrs, ptr)
-		if (bch2_dev_in_target(c, ptr->dev, target) &&
-		    (ca = bch2_dev_rcu(c, ptr->dev)) &&
-		    (!ptr->cached ||
-		     !dev_ptr_stale_rcu(ca, ptr))) {
-			ret = true;
-			break;
-		}
-	rcu_read_unlock();
-
-	return ret;
-}
-
-bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
-			   struct bch_extent_ptr m, u64 offset)
-{
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	const union bch_extent_entry *entry;
-	struct extent_ptr_decoded p;
-
-	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-		if (p.ptr.dev	== m.dev &&
-		    p.ptr.gen	== m.gen &&
-		    (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) ==
-		    (s64) m.offset  - offset)
-			return true;
-
-	return false;
-}
-
-/*
- * Returns true if two extents refer to the same data:
- */
-bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
-{
-	if (k1.k->type != k2.k->type)
-		return false;
-
-	if (bkey_extent_is_direct_data(k1.k)) {
-		struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1);
-		struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
-		const union bch_extent_entry *entry1, *entry2;
-		struct extent_ptr_decoded p1, p2;
-
-		if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2))
-			return false;
-
-		bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)
-			bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
-				if (p1.ptr.dev		== p2.ptr.dev &&
-				    p1.ptr.gen		== p2.ptr.gen &&
-				    (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
-				    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
-					return true;
-
-		return false;
-	} else {
-		/* KEY_TYPE_deleted, etc. */
-		return true;
-	}
-}
-
-struct bch_extent_ptr *
-bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2)
-{
-	struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2);
-	union bch_extent_entry *entry2;
-	struct extent_ptr_decoded p2;
-
-	bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
-		if (p1.ptr.dev		== p2.ptr.dev &&
-		    p1.ptr.gen		== p2.ptr.gen &&
-		    (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
-		    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
-			return &entry2->ptr;
-
-	return NULL;
-}
-
-void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr)
-{
-	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
-	union bch_extent_entry *entry;
-	union bch_extent_entry *ec = NULL;
-
-	bkey_extent_entry_for_each(ptrs, entry) {
-		if (&entry->ptr == ptr) {
-			ptr->cached = true;
-			if (ec)
-				extent_entry_drop(k, ec);
-			return;
-		}
-
-		if (extent_entry_is_stripe_ptr(entry))
-			ec = entry;
-		else if (extent_entry_is_ptr(entry))
-			ec = NULL;
-	}
-
-	BUG();
-}
-
-/*
- * bch_extent_normalize - clean up an extent, dropping stale pointers etc.
- *
- * Returns true if @k should be dropped entirely
- *
- * For existing keys, only called when btree nodes are being rewritten, not when
- * they're merely being compacted/resorted in memory.
- */
-bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
-{
-	struct bch_dev *ca;
-
-	rcu_read_lock();
-	bch2_bkey_drop_ptrs(k, ptr,
-		ptr->cached &&
-		(ca = bch2_dev_rcu(c, ptr->dev)) &&
-		dev_ptr_stale_rcu(ca, ptr));
-	rcu_read_unlock();
-
-	return bkey_deleted(k.k);
-}
-
-void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
-{
-	out->atomic++;
-	rcu_read_lock();
-	struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
-	if (!ca) {
-		prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
-			   (u64) ptr->offset, ptr->gen,
-			   ptr->cached ? " cached" : "");
-	} else {
-		u32 offset;
-		u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
-
-		prt_printf(out, "ptr: %u:%llu:%u gen %u",
-			   ptr->dev, b, offset, ptr->gen);
-		if (ptr->cached)
-			prt_str(out, " cached");
-		if (ptr->unwritten)
-			prt_str(out, " unwritten");
-		if (bucket_valid(ca, b) && dev_ptr_stale_rcu(ca, ptr))
-			prt_printf(out, " stale");
-	}
-	rcu_read_unlock();
-	--out->atomic;
-}
-
-void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
-			    struct bkey_s_c k)
-{
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	const union bch_extent_entry *entry;
-	bool first = true;
-
-	if (c)
-		prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k));
-
-	bkey_extent_entry_for_each(ptrs, entry) {
-		if (!first)
-			prt_printf(out, " ");
-
-		switch (__extent_entry_type(entry)) {
-		case BCH_EXTENT_ENTRY_ptr:
-			bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry));
-			break;
-
-		case BCH_EXTENT_ENTRY_crc32:
-		case BCH_EXTENT_ENTRY_crc64:
-		case BCH_EXTENT_ENTRY_crc128: {
-			struct bch_extent_crc_unpacked crc =
-				bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
-
-			prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ",
-			       crc.compressed_size,
-			       crc.uncompressed_size,
-			       crc.offset, crc.nonce);
-			bch2_prt_csum_type(out, crc.csum_type);
-			prt_str(out, " compress ");
-			bch2_prt_compression_type(out, crc.compression_type);
-			break;
-		}
-		case BCH_EXTENT_ENTRY_stripe_ptr: {
-			const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr;
-
-			prt_printf(out, "ec: idx %llu block %u",
-			       (u64) ec->idx, ec->block);
-			break;
-		}
-		case BCH_EXTENT_ENTRY_rebalance: {
-			const struct bch_extent_rebalance *r = &entry->rebalance;
-
-			prt_str(out, "rebalance: target ");
-			if (c)
-				bch2_target_to_text(out, c, r->target);
-			else
-				prt_printf(out, "%u", r->target);
-			prt_str(out, " compression ");
-			bch2_compression_opt_to_text(out, r->compression);
-			break;
-		}
-		default:
-			prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
-			return;
-		}
-
-		first = false;
-	}
-}
-
-static int extent_ptr_invalid(struct bch_fs *c,
-			      struct bkey_s_c k,
-			      enum bch_validate_flags flags,
-			      const struct bch_extent_ptr *ptr,
-			      unsigned size_ondisk,
-			      bool metadata,
-			      struct printbuf *err)
-{
-	int ret = 0;
-
-	rcu_read_lock();
-	struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
-	if (!ca) {
-		rcu_read_unlock();
-		return 0;
-	}
-	u32 bucket_offset;
-	u64 bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
-	unsigned first_bucket	= ca->mi.first_bucket;
-	u64 nbuckets		= ca->mi.nbuckets;
-	unsigned bucket_size	= ca->mi.bucket_size;
-	rcu_read_unlock();
-
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	bkey_for_each_ptr(ptrs, ptr2)
-		bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, err,
-				 ptr_to_duplicate_device,
-				 "multiple pointers to same device (%u)", ptr->dev);
-
-
-	bkey_fsck_err_on(bucket >= nbuckets, c, err,
-			 ptr_after_last_bucket,
-			 "pointer past last bucket (%llu > %llu)", bucket, nbuckets);
-	bkey_fsck_err_on(bucket < first_bucket, c, err,
-			 ptr_before_first_bucket,
-			 "pointer before first bucket (%llu < %u)", bucket, first_bucket);
-	bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size, c, err,
-			 ptr_spans_multiple_buckets,
-			 "pointer spans multiple buckets (%u + %u > %u)",
-		       bucket_offset, size_ondisk, bucket_size);
-fsck_err:
-	return ret;
-}
-
-int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k,
-			   enum bch_validate_flags flags,
-			   struct printbuf *err)
-{
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	const union bch_extent_entry *entry;
-	struct bch_extent_crc_unpacked crc;
-	unsigned size_ondisk = k.k->size;
-	unsigned nonce = UINT_MAX;
-	unsigned nr_ptrs = 0;
-	bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false;
-	int ret = 0;
-
-	if (bkey_is_btree_ptr(k.k))
-		size_ondisk = btree_sectors(c);
-
-	bkey_extent_entry_for_each(ptrs, entry) {
-		bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, c, err,
-			extent_ptrs_invalid_entry,
-			"invalid extent entry type (got %u, max %u)",
-			__extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
-
-		bkey_fsck_err_on(bkey_is_btree_ptr(k.k) &&
-				 !extent_entry_is_ptr(entry), c, err,
-				 btree_ptr_has_non_ptr,
-				 "has non ptr field");
-
-		switch (extent_entry_type(entry)) {
-		case BCH_EXTENT_ENTRY_ptr:
-			ret = extent_ptr_invalid(c, k, flags, &entry->ptr,
-						 size_ondisk, false, err);
-			if (ret)
-				return ret;
-
-			bkey_fsck_err_on(entry->ptr.cached && have_ec, c, err,
-					 ptr_cached_and_erasure_coded,
-					 "cached, erasure coded ptr");
-
-			if (!entry->ptr.unwritten)
-				have_written = true;
-			else
-				have_unwritten = true;
-
-			have_ec = false;
-			crc_since_last_ptr = false;
-			nr_ptrs++;
-			break;
-		case BCH_EXTENT_ENTRY_crc32:
-		case BCH_EXTENT_ENTRY_crc64:
-		case BCH_EXTENT_ENTRY_crc128:
-			crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
-
-			bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, c, err,
-					 ptr_crc_uncompressed_size_too_small,
-					 "checksum offset + key size > uncompressed size");
-			bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), c, err,
-					 ptr_crc_csum_type_unknown,
-					 "invalid checksum type");
-			bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, c, err,
-					 ptr_crc_compression_type_unknown,
-					 "invalid compression type");
-
-			if (bch2_csum_type_is_encryption(crc.csum_type)) {
-				if (nonce == UINT_MAX)
-					nonce = crc.offset + crc.nonce;
-				else if (nonce != crc.offset + crc.nonce)
-					bkey_fsck_err(c, err, ptr_crc_nonce_mismatch,
-						      "incorrect nonce");
-			}
-
-			bkey_fsck_err_on(crc_since_last_ptr, c, err,
-					 ptr_crc_redundant,
-					 "redundant crc entry");
-			crc_since_last_ptr = true;
-
-			bkey_fsck_err_on(crc_is_encoded(crc) &&
-					 (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
-					 (flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), c, err,
-					 ptr_crc_uncompressed_size_too_big,
-					 "too large encoded extent");
-
-			size_ondisk = crc.compressed_size;
-			break;
-		case BCH_EXTENT_ENTRY_stripe_ptr:
-			bkey_fsck_err_on(have_ec, c, err,
-					 ptr_stripe_redundant,
-					 "redundant stripe entry");
-			have_ec = true;
-			break;
-		case BCH_EXTENT_ENTRY_rebalance: {
-			const struct bch_extent_rebalance *r = &entry->rebalance;
-
-			if (!bch2_compression_opt_valid(r->compression)) {
-				struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
-				prt_printf(err, "invalid compression opt %u:%u",
-					   opt.type, opt.level);
-				return -BCH_ERR_invalid_bkey;
-			}
-			break;
-		}
-		}
-	}
-
-	bkey_fsck_err_on(!nr_ptrs, c, err,
-			 extent_ptrs_no_ptrs,
-			 "no ptrs");
-	bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, c, err,
-			 extent_ptrs_too_many_ptrs,
-			 "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX);
-	bkey_fsck_err_on(have_written && have_unwritten, c, err,
-			 extent_ptrs_written_and_unwritten,
-			 "extent with unwritten and written ptrs");
-	bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, c, err,
-			 extent_ptrs_unwritten,
-			 "has unwritten ptrs");
-	bkey_fsck_err_on(crc_since_last_ptr, c, err,
-			 extent_ptrs_redundant_crc,
-			 "redundant crc entry");
-	bkey_fsck_err_on(have_ec, c, err,
-			 extent_ptrs_redundant_stripe,
-			 "redundant stripe entry");
-fsck_err:
-	return ret;
-}
-
-void bch2_ptr_swab(struct bkey_s k)
-{
-	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
-	union bch_extent_entry *entry;
-	u64 *d;
-
-	for (d =  (u64 *) ptrs.start;
-	     d != (u64 *) ptrs.end;
-	     d++)
-		*d = swab64(*d);
-
-	for (entry = ptrs.start;
-	     entry < ptrs.end;
-	     entry = extent_entry_next(entry)) {
-		switch (extent_entry_type(entry)) {
-		case BCH_EXTENT_ENTRY_ptr:
-			break;
-		case BCH_EXTENT_ENTRY_crc32:
-			entry->crc32.csum = swab32(entry->crc32.csum);
-			break;
-		case BCH_EXTENT_ENTRY_crc64:
-			entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
-			entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
-			break;
-		case BCH_EXTENT_ENTRY_crc128:
-			entry->crc128.csum.hi = (__force __le64)
-				swab64((__force u64) entry->crc128.csum.hi);
-			entry->crc128.csum.lo = (__force __le64)
-				swab64((__force u64) entry->crc128.csum.lo);
-			break;
-		case BCH_EXTENT_ENTRY_stripe_ptr:
-			break;
-		case BCH_EXTENT_ENTRY_rebalance:
-			break;
-		}
-	}
-}
-
-const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
-{
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	const union bch_extent_entry *entry;
-
-	bkey_extent_entry_for_each(ptrs, entry)
-		if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
-			return &entry->rebalance;
-
-	return NULL;
-}
-
-unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
-				       unsigned target, unsigned compression)
-{
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	unsigned rewrite_ptrs = 0;
-
-	if (compression) {
-		unsigned compression_type = bch2_compression_opt_to_type(compression);
-		const union bch_extent_entry *entry;
-		struct extent_ptr_decoded p;
-		unsigned i = 0;
-
-		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-			if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
-			    p.ptr.unwritten) {
-				rewrite_ptrs = 0;
-				goto incompressible;
-			}
-
-			if (!p.ptr.cached && p.crc.compression_type != compression_type)
-				rewrite_ptrs |= 1U << i;
-			i++;
-		}
-	}
-incompressible:
-	if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
-		unsigned i = 0;
-
-		bkey_for_each_ptr(ptrs, ptr) {
-			if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target))
-				rewrite_ptrs |= 1U << i;
-			i++;
-		}
-	}
-
-	return rewrite_ptrs;
-}
-
-bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
-{
-	const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
-
-	/*
-	 * If it's an indirect extent, we don't delete the rebalance entry when
-	 * done so that we know what options were applied - check if it still
-	 * needs work done:
-	 */
-	if (r &&
-	    k.k->type == KEY_TYPE_reflink_v &&
-	    !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression))
-		r = NULL;
-
-	return r != NULL;
-}
-
-int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
-				  struct bch_io_opts *opts)
-{
-	struct bkey_s k = bkey_i_to_s(_k);
-	struct bch_extent_rebalance *r;
-	unsigned target = opts->background_target;
-	unsigned compression = background_compression(*opts);
-	bool needs_rebalance;
-
-	if (!bkey_extent_is_direct_data(k.k))
-		return 0;
-
-	/* get existing rebalance entry: */
-	r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
-	if (r) {
-		if (k.k->type == KEY_TYPE_reflink_v) {
-			/*
-			 * indirect extents: existing options take precedence,
-			 * so that we don't move extents back and forth if
-			 * they're referenced by different inodes with different
-			 * options:
-			 */
-			if (r->target)
-				target = r->target;
-			if (r->compression)
-				compression = r->compression;
-		}
-
-		r->target	= target;
-		r->compression	= compression;
-	}
-
-	needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression);
-
-	if (needs_rebalance && !r) {
-		union bch_extent_entry *new = bkey_val_end(k);
-
-		new->rebalance.type		= 1U << BCH_EXTENT_ENTRY_rebalance;
-		new->rebalance.compression	= compression;
-		new->rebalance.target		= target;
-		new->rebalance.unused		= 0;
-		k.k->u64s += extent_entry_u64s(new);
-	} else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) {
-		/*
-		 * For indirect extents, don't delete the rebalance entry when
-		 * we're finished so that we know we specifically moved it or
-		 * compressed it to its current location/compression type
-		 */
-		extent_entry_drop(k, (union bch_extent_entry *) r);
-	}
-
-	return 0;
-}
-
-/* Generic extent code: */
-
-int bch2_cut_front_s(struct bpos where, struct bkey_s k)
-{
-	unsigned new_val_u64s = bkey_val_u64s(k.k);
-	int val_u64s_delta;
-	u64 sub;
-
-	if (bkey_le(where, bkey_start_pos(k.k)))
-		return 0;
-
-	EBUG_ON(bkey_gt(where, k.k->p));
-
-	sub = where.offset - bkey_start_offset(k.k);
-
-	k.k->size -= sub;
-
-	if (!k.k->size) {
-		k.k->type = KEY_TYPE_deleted;
-		new_val_u64s = 0;
-	}
-
-	switch (k.k->type) {
-	case KEY_TYPE_extent:
-	case KEY_TYPE_reflink_v: {
-		struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
-		union bch_extent_entry *entry;
-		bool seen_crc = false;
-
-		bkey_extent_entry_for_each(ptrs, entry) {
-			switch (extent_entry_type(entry)) {
-			case BCH_EXTENT_ENTRY_ptr:
-				if (!seen_crc)
-					entry->ptr.offset += sub;
-				break;
-			case BCH_EXTENT_ENTRY_crc32:
-				entry->crc32.offset += sub;
-				break;
-			case BCH_EXTENT_ENTRY_crc64:
-				entry->crc64.offset += sub;
-				break;
-			case BCH_EXTENT_ENTRY_crc128:
-				entry->crc128.offset += sub;
-				break;
-			case BCH_EXTENT_ENTRY_stripe_ptr:
-				break;
-			case BCH_EXTENT_ENTRY_rebalance:
-				break;
-			}
-
-			if (extent_entry_is_crc(entry))
-				seen_crc = true;
-		}
-
-		break;
-	}
-	case KEY_TYPE_reflink_p: {
-		struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);
-
-		le64_add_cpu(&p.v->idx, sub);
-		break;
-	}
-	case KEY_TYPE_inline_data:
-	case KEY_TYPE_indirect_inline_data: {
-		void *p = bkey_inline_data_p(k);
-		unsigned bytes = bkey_inline_data_bytes(k.k);
-
-		sub = min_t(u64, sub << 9, bytes);
-
-		memmove(p, p + sub, bytes - sub);
-
-		new_val_u64s -= sub >> 3;
-		break;
-	}
-	}
-
-	val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
-	BUG_ON(val_u64s_delta < 0);
-
-	set_bkey_val_u64s(k.k, new_val_u64s);
-	memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
-	return -val_u64s_delta;
-}
-
-int bch2_cut_back_s(struct bpos where, struct bkey_s k)
-{
-	unsigned new_val_u64s = bkey_val_u64s(k.k);
-	int val_u64s_delta;
-	u64 len = 0;
-
-	if (bkey_ge(where, k.k->p))
-		return 0;
-
-	EBUG_ON(bkey_lt(where, bkey_start_pos(k.k)));
-
-	len = where.offset - bkey_start_offset(k.k);
-
-	k.k->p.offset = where.offset;
-	k.k->size = len;
-
-	if (!len) {
-		k.k->type = KEY_TYPE_deleted;
-		new_val_u64s = 0;
-	}
-
-	switch (k.k->type) {
-	case KEY_TYPE_inline_data:
-	case KEY_TYPE_indirect_inline_data:
-		new_val_u64s = (bkey_inline_data_offset(k.k) +
-				min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3;
-		break;
-	}
-
-	val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
-	BUG_ON(val_u64s_delta < 0);
-
-	set_bkey_val_u64s(k.k, new_val_u64s);
-	memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
-	return -val_u64s_delta;
-}
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
deleted file mode 100644
index 1ade959652b2..000000000000
--- a/fs/bcachefs/extents.h
+++ /dev/null
@@ -1,739 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EXTENTS_H
-#define _BCACHEFS_EXTENTS_H
-
-#include "bcachefs.h"
-#include "bkey.h"
-#include "extents_types.h"
-
-struct bch_fs;
-struct btree_trans;
-enum bch_validate_flags;
-
-/* extent entries: */
-
-#define extent_entry_last(_e)						\
-	((typeof(&(_e).v->start[0])) bkey_val_end(_e))
-
-#define entry_to_ptr(_entry)						\
-({									\
-	EBUG_ON((_entry) && !extent_entry_is_ptr(_entry));		\
-									\
-	__builtin_choose_expr(						\
-		type_is_exact(_entry, const union bch_extent_entry *),	\
-		(const struct bch_extent_ptr *) (_entry),		\
-		(struct bch_extent_ptr *) (_entry));			\
-})
-
-/* downcast, preserves const */
-#define to_entry(_entry)						\
-({									\
-	BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) &&	\
-		     !type_is(_entry, struct bch_extent_ptr *) &&	\
-		     !type_is(_entry, struct bch_extent_stripe_ptr *));	\
-									\
-	__builtin_choose_expr(						\
-		(type_is_exact(_entry, const union bch_extent_crc *) ||	\
-		 type_is_exact(_entry, const struct bch_extent_ptr *) ||\
-		 type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\
-		(const union bch_extent_entry *) (_entry),		\
-		(union bch_extent_entry *) (_entry));			\
-})
-
-#define extent_entry_next(_entry)					\
-	((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
-
-#define extent_entry_next_safe(_entry, _end)				\
-	(likely(__extent_entry_type(_entry) < BCH_EXTENT_ENTRY_MAX)	\
-	 ? extent_entry_next(_entry)					\
-	 : _end)
-
-static inline unsigned
-__extent_entry_type(const union bch_extent_entry *e)
-{
-	return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX;
-}
-
-static inline enum bch_extent_entry_type
-extent_entry_type(const union bch_extent_entry *e)
-{
-	int ret = __ffs(e->type);
-
-	EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX);
-
-	return ret;
-}
-
-static inline size_t extent_entry_bytes(const union bch_extent_entry *entry)
-{
-	switch (extent_entry_type(entry)) {
-#define x(f, n)						\
-	case BCH_EXTENT_ENTRY_##f:			\
-		return sizeof(struct bch_extent_##f);
-	BCH_EXTENT_ENTRY_TYPES()
-#undef x
-	default:
-		BUG();
-	}
-}
-
-static inline size_t extent_entry_u64s(const union bch_extent_entry *entry)
-{
-	return extent_entry_bytes(entry) / sizeof(u64);
-}
-
-static inline void __extent_entry_insert(struct bkey_i *k,
-					 union bch_extent_entry *dst,
-					 union bch_extent_entry *new)
-{
-	union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
-
-	memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new),
-			      dst, (u64 *) end - (u64 *) dst);
-	k->k.u64s += extent_entry_u64s(new);
-	memcpy_u64s_small(dst, new, extent_entry_u64s(new));
-}
-
-static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
-{
-	union bch_extent_entry *next = extent_entry_next(entry);
-
-	/* stripes have ptrs, but their layout doesn't work with this code */
-	BUG_ON(k.k->type == KEY_TYPE_stripe);
-
-	memmove_u64s_down(entry, next,
-			  (u64 *) bkey_val_end(k) - (u64 *) next);
-	k.k->u64s -= (u64 *) next - (u64 *) entry;
-}
-
-static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
-{
-	return __extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
-}
-
-static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e)
-{
-	return __extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
-}
-
-static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
-{
-	switch (__extent_entry_type(e)) {
-	case BCH_EXTENT_ENTRY_crc32:
-	case BCH_EXTENT_ENTRY_crc64:
-	case BCH_EXTENT_ENTRY_crc128:
-		return true;
-	default:
-		return false;
-	}
-}
-
-union bch_extent_crc {
-	u8				type;
-	struct bch_extent_crc32		crc32;
-	struct bch_extent_crc64		crc64;
-	struct bch_extent_crc128	crc128;
-};
-
-#define __entry_to_crc(_entry)						\
-	__builtin_choose_expr(						\
-		type_is_exact(_entry, const union bch_extent_entry *),	\
-		(const union bch_extent_crc *) (_entry),		\
-		(union bch_extent_crc *) (_entry))
-
-#define entry_to_crc(_entry)						\
-({									\
-	EBUG_ON((_entry) && !extent_entry_is_crc(_entry));		\
-									\
-	__entry_to_crc(_entry);						\
-})
-
-static inline struct bch_extent_crc_unpacked
-bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
-{
-#define common_fields(_crc)						\
-		.csum_type		= _crc.csum_type,		\
-		.compression_type	= _crc.compression_type,	\
-		.compressed_size	= _crc._compressed_size + 1,	\
-		.uncompressed_size	= _crc._uncompressed_size + 1,	\
-		.offset			= _crc.offset,			\
-		.live_size		= k->size
-
-	if (!crc)
-		return (struct bch_extent_crc_unpacked) {
-			.compressed_size	= k->size,
-			.uncompressed_size	= k->size,
-			.live_size		= k->size,
-		};
-
-	switch (extent_entry_type(to_entry(crc))) {
-	case BCH_EXTENT_ENTRY_crc32: {
-		struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
-			common_fields(crc->crc32),
-		};
-
-		*((__le32 *) &ret.csum.lo) = (__le32 __force) crc->crc32.csum;
-		return ret;
-	}
-	case BCH_EXTENT_ENTRY_crc64: {
-		struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
-			common_fields(crc->crc64),
-			.nonce			= crc->crc64.nonce,
-			.csum.lo		= (__force __le64) crc->crc64.csum_lo,
-		};
-
-		*((__le16 *) &ret.csum.hi) = (__le16 __force) crc->crc64.csum_hi;
-
-		return ret;
-	}
-	case BCH_EXTENT_ENTRY_crc128: {
-		struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
-			common_fields(crc->crc128),
-			.nonce			= crc->crc128.nonce,
-			.csum			= crc->crc128.csum,
-		};
-
-		return ret;
-	}
-	default:
-		BUG();
-	}
-#undef common_fields
-}
-
-static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc)
-{
-	return (crc.compression_type != BCH_COMPRESSION_TYPE_none &&
-		crc.compression_type != BCH_COMPRESSION_TYPE_incompressible);
-}
-
-static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc)
-{
-	return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc);
-}
-
-/* bkey_ptrs: generically over any key type that has ptrs */
-
-struct bkey_ptrs_c {
-	const union bch_extent_entry	*start;
-	const union bch_extent_entry	*end;
-};
-
-struct bkey_ptrs {
-	union bch_extent_entry	*start;
-	union bch_extent_entry	*end;
-};
-
-static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
-{
-	switch (k.k->type) {
-	case KEY_TYPE_btree_ptr: {
-		struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k);
-
-		return (struct bkey_ptrs_c) {
-			to_entry(&e.v->start[0]),
-			to_entry(extent_entry_last(e))
-		};
-	}
-	case KEY_TYPE_extent: {
-		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-
-		return (struct bkey_ptrs_c) {
-			e.v->start,
-			extent_entry_last(e)
-		};
-	}
-	case KEY_TYPE_stripe: {
-		struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-
-		return (struct bkey_ptrs_c) {
-			to_entry(&s.v->ptrs[0]),
-			to_entry(&s.v->ptrs[s.v->nr_blocks]),
-		};
-	}
-	case KEY_TYPE_reflink_v: {
-		struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
-
-		return (struct bkey_ptrs_c) {
-			r.v->start,
-			bkey_val_end(r),
-		};
-	}
-	case KEY_TYPE_btree_ptr_v2: {
-		struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k);
-
-		return (struct bkey_ptrs_c) {
-			to_entry(&e.v->start[0]),
-			to_entry(extent_entry_last(e))
-		};
-	}
-	default:
-		return (struct bkey_ptrs_c) { NULL, NULL };
-	}
-}
-
-static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
-{
-	struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c);
-
-	return (struct bkey_ptrs) {
-		(void *) p.start,
-		(void *) p.end
-	};
-}
-
-#define __bkey_extent_entry_for_each_from(_start, _end, _entry)		\
-	for ((_entry) = (_start);					\
-	     (_entry) < (_end);						\
-	     (_entry) = extent_entry_next_safe(_entry, _end))
-
-#define __bkey_ptr_next(_ptr, _end)					\
-({									\
-	typeof(_end) _entry;						\
-									\
-	__bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry)	\
-		if (extent_entry_is_ptr(_entry))			\
-			break;						\
-									\
-	_entry < (_end) ? entry_to_ptr(_entry) : NULL;			\
-})
-
-#define bkey_extent_entry_for_each_from(_p, _entry, _start)		\
-	__bkey_extent_entry_for_each_from(_start, (_p).end, _entry)
-
-#define bkey_extent_entry_for_each(_p, _entry)				\
-	bkey_extent_entry_for_each_from(_p, _entry, _p.start)
-
-#define __bkey_for_each_ptr(_start, _end, _ptr)				\
-	for (typeof(_start) (_ptr) = (_start);				\
-	     ((_ptr) = __bkey_ptr_next(_ptr, _end));			\
-	     (_ptr)++)
-
-#define bkey_ptr_next(_p, _ptr)						\
-	__bkey_ptr_next(_ptr, (_p).end)
-
-#define bkey_for_each_ptr(_p, _ptr)					\
-	__bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr)
-
-#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry)			\
-({									\
-	__label__ out;							\
-									\
-	(_ptr).idx	= 0;						\
-	(_ptr).has_ec	= false;					\
-									\
-	__bkey_extent_entry_for_each_from(_entry, _end, _entry)		\
-		switch (__extent_entry_type(_entry)) {			\
-		case BCH_EXTENT_ENTRY_ptr:				\
-			(_ptr).ptr		= _entry->ptr;		\
-			goto out;					\
-		case BCH_EXTENT_ENTRY_crc32:				\
-		case BCH_EXTENT_ENTRY_crc64:				\
-		case BCH_EXTENT_ENTRY_crc128:				\
-			(_ptr).crc = bch2_extent_crc_unpack(_k,		\
-					entry_to_crc(_entry));		\
-			break;						\
-		case BCH_EXTENT_ENTRY_stripe_ptr:			\
-			(_ptr).ec = _entry->stripe_ptr;			\
-			(_ptr).has_ec	= true;				\
-			break;						\
-		default:						\
-			/* nothing */					\
-			break;						\
-		}							\
-out:									\
-	_entry < (_end);						\
-})
-
-#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry)	\
-	for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL),		\
-	     (_entry) = _start;						\
-	     __bkey_ptr_next_decode(_k, _end, _ptr, _entry);		\
-	     (_entry) = extent_entry_next_safe(_entry, _end))
-
-#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry)			\
-	__bkey_for_each_ptr_decode(_k, (_p).start, (_p).end,		\
-				   _ptr, _entry)
-
-#define bkey_crc_next(_k, _start, _end, _crc, _iter)			\
-({									\
-	__bkey_extent_entry_for_each_from(_iter, _end, _iter)		\
-		if (extent_entry_is_crc(_iter)) {			\
-			(_crc) = bch2_extent_crc_unpack(_k,		\
-						entry_to_crc(_iter));	\
-			break;						\
-		}							\
-									\
-	(_iter) < (_end);						\
-})
-
-#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter)		\
-	for ((_crc) = bch2_extent_crc_unpack(_k, NULL),			\
-	     (_iter) = (_start);					\
-	     bkey_crc_next(_k, _start, _end, _crc, _iter);		\
-	     (_iter) = extent_entry_next(_iter))
-
-#define bkey_for_each_crc(_k, _p, _crc, _iter)				\
-	__bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter)
-
-/* Iterate over pointers in KEY_TYPE_extent: */
-
-#define extent_for_each_entry_from(_e, _entry, _start)			\
-	__bkey_extent_entry_for_each_from(_start,			\
-				extent_entry_last(_e), _entry)
-
-#define extent_for_each_entry(_e, _entry)				\
-	extent_for_each_entry_from(_e, _entry, (_e).v->start)
-
-#define extent_ptr_next(_e, _ptr)					\
-	__bkey_ptr_next(_ptr, extent_entry_last(_e))
-
-#define extent_for_each_ptr(_e, _ptr)					\
-	__bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr)
-
-#define extent_for_each_ptr_decode(_e, _ptr, _entry)			\
-	__bkey_for_each_ptr_decode((_e).k, (_e).v->start,		\
-				   extent_entry_last(_e), _ptr, _entry)
-
-/* utility code common to all keys with pointers: */
-
-void bch2_mark_io_failure(struct bch_io_failures *,
-			  struct extent_ptr_decoded *);
-int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
-			       struct bch_io_failures *,
-			       struct extent_ptr_decoded *);
-
-/* KEY_TYPE_btree_ptr: */
-
-int bch2_btree_ptr_invalid(struct bch_fs *, struct bkey_s_c,
-			   enum bch_validate_flags, struct printbuf *);
-void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
-			    struct bkey_s_c);
-
-int bch2_btree_ptr_v2_invalid(struct bch_fs *, struct bkey_s_c,
-			      enum bch_validate_flags, struct printbuf *);
-void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
-			      int, struct bkey_s);
-
-#define bch2_bkey_ops_btree_ptr ((struct bkey_ops) {		\
-	.key_invalid	= bch2_btree_ptr_invalid,		\
-	.val_to_text	= bch2_btree_ptr_to_text,		\
-	.swab		= bch2_ptr_swab,			\
-	.trigger	= bch2_trigger_extent,			\
-})
-
-#define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) {		\
-	.key_invalid	= bch2_btree_ptr_v2_invalid,		\
-	.val_to_text	= bch2_btree_ptr_v2_to_text,		\
-	.swab		= bch2_ptr_swab,			\
-	.compat		= bch2_btree_ptr_v2_compat,		\
-	.trigger	= bch2_trigger_extent,			\
-	.min_val_size	= 40,					\
-})
-
-/* KEY_TYPE_extent: */
-
-bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-
-#define bch2_bkey_ops_extent ((struct bkey_ops) {		\
-	.key_invalid	= bch2_bkey_ptrs_invalid,		\
-	.val_to_text	= bch2_bkey_ptrs_to_text,		\
-	.swab		= bch2_ptr_swab,			\
-	.key_normalize	= bch2_extent_normalize,		\
-	.key_merge	= bch2_extent_merge,			\
-	.trigger	= bch2_trigger_extent,			\
-})
-
-/* KEY_TYPE_reservation: */
-
-int bch2_reservation_invalid(struct bch_fs *, struct bkey_s_c,
-			     enum bch_validate_flags, struct printbuf *);
-void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-
-#define bch2_bkey_ops_reservation ((struct bkey_ops) {		\
-	.key_invalid	= bch2_reservation_invalid,		\
-	.val_to_text	= bch2_reservation_to_text,		\
-	.key_merge	= bch2_reservation_merge,		\
-	.trigger	= bch2_trigger_reservation,		\
-	.min_val_size	= 8,					\
-})
-
-/* Extent checksum entries: */
-
-bool bch2_can_narrow_extent_crcs(struct bkey_s_c,
-				 struct bch_extent_crc_unpacked);
-bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked);
-void bch2_extent_crc_append(struct bkey_i *,
-			    struct bch_extent_crc_unpacked);
-
-/* Generic code for keys with pointers: */
-
-static inline bool bkey_is_btree_ptr(const struct bkey *k)
-{
-	switch (k->type) {
-	case KEY_TYPE_btree_ptr:
-	case KEY_TYPE_btree_ptr_v2:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static inline bool bkey_extent_is_direct_data(const struct bkey *k)
-{
-	switch (k->type) {
-	case KEY_TYPE_btree_ptr:
-	case KEY_TYPE_btree_ptr_v2:
-	case KEY_TYPE_extent:
-	case KEY_TYPE_reflink_v:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static inline bool bkey_extent_is_inline_data(const struct bkey *k)
-{
-	return  k->type == KEY_TYPE_inline_data ||
-		k->type == KEY_TYPE_indirect_inline_data;
-}
-
-static inline unsigned bkey_inline_data_offset(const struct bkey *k)
-{
-	switch (k->type) {
-	case KEY_TYPE_inline_data:
-		return sizeof(struct bch_inline_data);
-	case KEY_TYPE_indirect_inline_data:
-		return sizeof(struct bch_indirect_inline_data);
-	default:
-		BUG();
-	}
-}
-
-static inline unsigned bkey_inline_data_bytes(const struct bkey *k)
-{
-	return bkey_val_bytes(k) - bkey_inline_data_offset(k);
-}
-
-#define bkey_inline_data_p(_k)	(((void *) (_k).v) + bkey_inline_data_offset((_k).k))
-
-static inline bool bkey_extent_is_data(const struct bkey *k)
-{
-	return  bkey_extent_is_direct_data(k) ||
-		bkey_extent_is_inline_data(k) ||
-		k->type == KEY_TYPE_reflink_p;
-}
-
-/*
- * Should extent be counted under inode->i_sectors?
- */
-static inline bool bkey_extent_is_allocation(const struct bkey *k)
-{
-	switch (k->type) {
-	case KEY_TYPE_extent:
-	case KEY_TYPE_reservation:
-	case KEY_TYPE_reflink_p:
-	case KEY_TYPE_reflink_v:
-	case KEY_TYPE_inline_data:
-	case KEY_TYPE_indirect_inline_data:
-	case KEY_TYPE_error:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static inline bool bkey_extent_is_unwritten(struct bkey_s_c k)
-{
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
-	bkey_for_each_ptr(ptrs, ptr)
-		if (ptr->unwritten)
-			return true;
-	return false;
-}
-
-static inline bool bkey_extent_is_reservation(struct bkey_s_c k)
-{
-	return k.k->type == KEY_TYPE_reservation ||
-		bkey_extent_is_unwritten(k);
-}
-
-static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
-{
-	struct bch_devs_list ret = (struct bch_devs_list) { 0 };
-	struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
-
-	bkey_for_each_ptr(p, ptr)
-		ret.data[ret.nr++] = ptr->dev;
-
-	return ret;
-}
-
-static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
-{
-	struct bch_devs_list ret = (struct bch_devs_list) { 0 };
-	struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
-
-	bkey_for_each_ptr(p, ptr)
-		if (!ptr->cached)
-			ret.data[ret.nr++] = ptr->dev;
-
-	return ret;
-}
-
-static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
-{
-	struct bch_devs_list ret = (struct bch_devs_list) { 0 };
-	struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
-
-	bkey_for_each_ptr(p, ptr)
-		if (ptr->cached)
-			ret.data[ret.nr++] = ptr->dev;
-
-	return ret;
-}
-
-unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
-unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
-unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
-bool bch2_bkey_is_incompressible(struct bkey_s_c);
-unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
-
-unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
-unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_decoded *);
-unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *);
-unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
-
-void bch2_bkey_drop_device(struct bkey_s, unsigned);
-void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
-
-const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned);
-
-static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev)
-{
-	return (void *) bch2_bkey_has_device_c(k.s_c, dev);
-}
-
-bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
-
-void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *);
-
-static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr)
-{
-	struct bch_extent_ptr *dest;
-
-	EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev));
-
-	switch (k->k.type) {
-	case KEY_TYPE_btree_ptr:
-	case KEY_TYPE_btree_ptr_v2:
-	case KEY_TYPE_extent:
-		EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX);
-
-		ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
-		dest = (struct bch_extent_ptr *)((void *) &k->v + bkey_val_bytes(&k->k));
-		*dest = ptr;
-		k->k.u64s++;
-		break;
-	default:
-		BUG();
-	}
-}
-
-void bch2_extent_ptr_decoded_append(struct bkey_i *,
-				    struct extent_ptr_decoded *);
-union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s,
-						   struct bch_extent_ptr *);
-union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
-					   struct bch_extent_ptr *);
-
-#define bch2_bkey_drop_ptrs(_k, _ptr, _cond)				\
-do {									\
-	struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k);			\
-									\
-	struct bch_extent_ptr *_ptr = &_ptrs.start->ptr;		\
-									\
-	while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) {			\
-		if (_cond) {						\
-			_ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr);	\
-			_ptrs = bch2_bkey_ptrs(_k);			\
-			continue;					\
-		}							\
-									\
-		(_ptr)++;						\
-	}								\
-} while (0)
-
-bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
-			   struct bch_extent_ptr, u64);
-bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c);
-struct bch_extent_ptr *
-bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);
-
-void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *);
-
-bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
-void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *);
-void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
-			    struct bkey_s_c);
-int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c,
-			   enum bch_validate_flags, struct printbuf *);
-
-void bch2_ptr_swab(struct bkey_s);
-
-const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
-unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
-				       unsigned, unsigned);
-bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
-
-int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
-				  struct bch_io_opts *);
-
-/* Generic extent code: */
-
-enum bch_extent_overlap {
-	BCH_EXTENT_OVERLAP_ALL		= 0,
-	BCH_EXTENT_OVERLAP_BACK		= 1,
-	BCH_EXTENT_OVERLAP_FRONT	= 2,
-	BCH_EXTENT_OVERLAP_MIDDLE	= 3,
-};
-
-/* Returns how k overlaps with m */
-static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
-							  const struct bkey *m)
-{
-	int cmp1 = bkey_lt(k->p, m->p);
-	int cmp2 = bkey_gt(bkey_start_pos(k), bkey_start_pos(m));
-
-	return (cmp1 << 1) + cmp2;
-}
-
-int bch2_cut_front_s(struct bpos, struct bkey_s);
-int bch2_cut_back_s(struct bpos, struct bkey_s);
-
-static inline void bch2_cut_front(struct bpos where, struct bkey_i *k)
-{
-	bch2_cut_front_s(where, bkey_i_to_s(k));
-}
-
-static inline void bch2_cut_back(struct bpos where, struct bkey_i *k)
-{
-	bch2_cut_back_s(where, bkey_i_to_s(k));
-}
-
-/**
- * bch_key_resize - adjust size of @k
- *
- * bkey_start_offset(k) will be preserved, modifies where the extent ends
- */
-static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
-{
-	k->p.offset -= k->size;
-	k->p.offset += new_size;
-	k->size = new_size;
-}
-
-#endif /* _BCACHEFS_EXTENTS_H */
diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h
deleted file mode 100644
index 3bd2fdbb0817..000000000000
--- a/fs/bcachefs/extents_format.h
+++ /dev/null
@@ -1,295 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EXTENTS_FORMAT_H
-#define _BCACHEFS_EXTENTS_FORMAT_H
-
-/*
- * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
- * preceded by checksum/compression information (bch_extent_crc32 or
- * bch_extent_crc64).
- *
- * One major determining factor in the format of extents is how we handle and
- * represent extents that have been partially overwritten and thus trimmed:
- *
- * If an extent is not checksummed or compressed, when the extent is trimmed we
- * don't have to remember the extent we originally allocated and wrote: we can
- * merely adjust ptr->offset to point to the start of the data that is currently
- * live. The size field in struct bkey records the current (live) size of the
- * extent, and is also used to mean "size of region on disk that we point to" in
- * this case.
- *
- * Thus an extent that is not checksummed or compressed will consist only of a
- * list of bch_extent_ptrs, with none of the fields in
- * bch_extent_crc32/bch_extent_crc64.
- *
- * When an extent is checksummed or compressed, it's not possible to read only
- * the data that is currently live: we have to read the entire extent that was
- * originally written, and then return only the part of the extent that is
- * currently live.
- *
- * Thus, in addition to the current size of the extent in struct bkey, we need
- * to store the size of the originally allocated space - this is the
- * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
- * when the extent is trimmed, instead of modifying the offset field of the
- * pointer, we keep a second smaller offset field - "offset into the original
- * extent of the currently live region".
- *
- * The other major determining factor is replication and data migration:
- *
- * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
- * write, we will initially write all the replicas in the same format, with the
- * same checksum type and compression format - however, when copygc runs later (or
- * tiering/cache promotion, anything that moves data), it is not in general
- * going to rewrite all the pointers at once - one of the replicas may be in a
- * bucket on one device that has very little fragmentation while another lives
- * in a bucket that has become heavily fragmented, and thus is being rewritten
- * sooner than the rest.
- *
- * Thus it will only move a subset of the pointers (or in the case of
- * tiering/cache promotion perhaps add a single pointer without dropping any
- * current pointers), and if the extent has been partially overwritten it must
- * write only the currently live portion (or copygc would not be able to reduce
- * fragmentation!) - which necessitates a different bch_extent_crc format for
- * the new pointer.
- *
- * But in the interests of space efficiency, we don't want to store one
- * bch_extent_crc for each pointer if we don't have to.
- *
- * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
- * bch_extent_ptrs appended arbitrarily one after the other. We determine the
- * type of a given entry with a scheme similar to utf8 (except we're encoding a
- * type, not a size), encoding the type in the position of the first set bit:
- *
- * bch_extent_crc32	- 0b1
- * bch_extent_ptr	- 0b10
- * bch_extent_crc64	- 0b100
- *
- * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
- * bch_extent_crc64 is the least constrained).
- *
- * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
- * until the next bch_extent_crc32/64.
- *
- * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
- * is neither checksummed nor compressed.
- */
-
-#define BCH_EXTENT_ENTRY_TYPES()		\
-	x(ptr,			0)		\
-	x(crc32,		1)		\
-	x(crc64,		2)		\
-	x(crc128,		3)		\
-	x(stripe_ptr,		4)		\
-	x(rebalance,		5)
-#define BCH_EXTENT_ENTRY_MAX	6
-
-enum bch_extent_entry_type {
-#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
-	BCH_EXTENT_ENTRY_TYPES()
-#undef x
-};
-
-/* Compressed/uncompressed size are stored biased by 1: */
-struct bch_extent_crc32 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-	__u32			type:2,
-				_compressed_size:7,
-				_uncompressed_size:7,
-				offset:7,
-				_unused:1,
-				csum_type:4,
-				compression_type:4;
-	__u32			csum;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-	__u32			csum;
-	__u32			compression_type:4,
-				csum_type:4,
-				_unused:1,
-				offset:7,
-				_uncompressed_size:7,
-				_compressed_size:7,
-				type:2;
-#endif
-} __packed __aligned(8);
-
-#define CRC32_SIZE_MAX		(1U << 7)
-#define CRC32_NONCE_MAX		0
-
-struct bch_extent_crc64 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-	__u64			type:3,
-				_compressed_size:9,
-				_uncompressed_size:9,
-				offset:9,
-				nonce:10,
-				csum_type:4,
-				compression_type:4,
-				csum_hi:16;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-	__u64			csum_hi:16,
-				compression_type:4,
-				csum_type:4,
-				nonce:10,
-				offset:9,
-				_uncompressed_size:9,
-				_compressed_size:9,
-				type:3;
-#endif
-	__u64			csum_lo;
-} __packed __aligned(8);
-
-#define CRC64_SIZE_MAX		(1U << 9)
-#define CRC64_NONCE_MAX		((1U << 10) - 1)
-
-struct bch_extent_crc128 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-	__u64			type:4,
-				_compressed_size:13,
-				_uncompressed_size:13,
-				offset:13,
-				nonce:13,
-				csum_type:4,
-				compression_type:4;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-	__u64			compression_type:4,
-				csum_type:4,
-				nonce:13,
-				offset:13,
-				_uncompressed_size:13,
-				_compressed_size:13,
-				type:4;
-#endif
-	struct bch_csum		csum;
-} __packed __aligned(8);
-
-#define CRC128_SIZE_MAX		(1U << 13)
-#define CRC128_NONCE_MAX	((1U << 13) - 1)
-
-/*
- * @reservation - pointer hasn't been written to, just reserved
- */
-struct bch_extent_ptr {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-	__u64			type:1,
-				cached:1,
-				unused:1,
-				unwritten:1,
-				offset:44, /* 8 petabytes */
-				dev:8,
-				gen:8;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-	__u64			gen:8,
-				dev:8,
-				offset:44,
-				unwritten:1,
-				unused:1,
-				cached:1,
-				type:1;
-#endif
-} __packed __aligned(8);
-
-struct bch_extent_stripe_ptr {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-	__u64			type:5,
-				block:8,
-				redundancy:4,
-				idx:47;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-	__u64			idx:47,
-				redundancy:4,
-				block:8,
-				type:5;
-#endif
-};
-
-struct bch_extent_rebalance {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-	__u64			type:6,
-				unused:34,
-				compression:8, /* enum bch_compression_opt */
-				target:16;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-	__u64			target:16,
-				compression:8,
-				unused:34,
-				type:6;
-#endif
-};
-
-union bch_extent_entry {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ||  __BITS_PER_LONG == 64
-	unsigned long			type;
-#elif __BITS_PER_LONG == 32
-	struct {
-		unsigned long		pad;
-		unsigned long		type;
-	};
-#else
-#error edit for your odd byteorder.
-#endif
-
-#define x(f, n) struct bch_extent_##f	f;
-	BCH_EXTENT_ENTRY_TYPES()
-#undef x
-};
-
-struct bch_btree_ptr {
-	struct bch_val		v;
-
-	__u64			_data[0];
-	struct bch_extent_ptr	start[];
-} __packed __aligned(8);
-
-struct bch_btree_ptr_v2 {
-	struct bch_val		v;
-
-	__u64			mem_ptr;
-	__le64			seq;
-	__le16			sectors_written;
-	__le16			flags;
-	struct bpos		min_key;
-	__u64			_data[0];
-	struct bch_extent_ptr	start[];
-} __packed __aligned(8);
-
-LE16_BITMASK(BTREE_PTR_RANGE_UPDATED,	struct bch_btree_ptr_v2, flags, 0, 1);
-
-struct bch_extent {
-	struct bch_val		v;
-
-	__u64			_data[0];
-	union bch_extent_entry	start[];
-} __packed __aligned(8);
-
-/* Maximum size (in u64s) a single pointer could be: */
-#define BKEY_EXTENT_PTR_U64s_MAX\
-	((sizeof(struct bch_extent_crc128) +			\
-	  sizeof(struct bch_extent_ptr)) / sizeof(__u64))
-
-/* Maximum possible size of an entire extent value: */
-#define BKEY_EXTENT_VAL_U64s_MAX				\
-	(1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
-
-/* * Maximum possible size of an entire extent, key + value: */
-#define BKEY_EXTENT_U64s_MAX		(BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
-
-/* Btree pointers don't carry around checksums: */
-#define BKEY_BTREE_PTR_VAL_U64s_MAX				\
-	((sizeof(struct bch_btree_ptr_v2) +			\
-	  sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
-#define BKEY_BTREE_PTR_U64s_MAX					\
-	(BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
-
-struct bch_reservation {
-	struct bch_val		v;
-
-	__le32			generation;
-	__u8			nr_replicas;
-	__u8			pad[3];
-} __packed __aligned(8);
-
-struct bch_inline_data {
-	struct bch_val		v;
-	u8			data[];
-};
-
-#endif /* _BCACHEFS_EXTENTS_FORMAT_H */
diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h
deleted file mode 100644
index 43d6c341ecca..000000000000
--- a/fs/bcachefs/extents_types.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EXTENTS_TYPES_H
-#define _BCACHEFS_EXTENTS_TYPES_H
-
-#include "bcachefs_format.h"
-
-struct bch_extent_crc_unpacked {
-	u32			compressed_size;
-	u32			uncompressed_size;
-	u32			live_size;
-
-	u8			csum_type;
-	u8			compression_type;
-
-	u16			offset;
-
-	u16			nonce;
-
-	struct bch_csum		csum;
-};
-
-struct extent_ptr_decoded {
-	unsigned			idx;
-	bool				has_ec;
-	struct bch_extent_crc_unpacked	crc;
-	struct bch_extent_ptr		ptr;
-	struct bch_extent_stripe_ptr	ec;
-};
-
-struct bch_io_failures {
-	u8			nr;
-	struct bch_dev_io_failures {
-		u8		dev;
-		u8		idx;
-		u8		nr_failed;
-		u8		nr_retries;
-	}			devs[BCH_REPLICAS_MAX];
-};
-
-#endif /* _BCACHEFS_EXTENTS_TYPES_H */
diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c
deleted file mode 100644
index 2eaffe37b5e7..000000000000
--- a/fs/bcachefs/eytzinger.c
+++ /dev/null
@@ -1,305 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "eytzinger.h"
-
-/**
- * is_aligned - is this pointer & size okay for word-wide copying?
- * @base: pointer to data
- * @size: size of each element
- * @align: required alignment (typically 4 or 8)
- *
- * Returns true if elements can be copied using word loads and stores.
- * The size must be a multiple of the alignment, and the base address must
- * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
- *
- * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
- * to "if ((a | b) & mask)", so we do that by hand.
- */
-__attribute_const__ __always_inline
-static bool is_aligned(const void *base, size_t size, unsigned char align)
-{
-	unsigned char lsbits = (unsigned char)size;
-
-	(void)base;
-#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
-	lsbits |= (unsigned char)(uintptr_t)base;
-#endif
-	return (lsbits & (align - 1)) == 0;
-}
-
-/**
- * swap_words_32 - swap two elements in 32-bit chunks
- * @a: pointer to the first element to swap
- * @b: pointer to the second element to swap
- * @n: element size (must be a multiple of 4)
- *
- * Exchange the two objects in memory.  This exploits base+index addressing,
- * which basically all CPUs have, to minimize loop overhead computations.
- *
- * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
- * bottom of the loop, even though the zero flag is still valid from the
- * subtract (since the intervening mov instructions don't alter the flags).
- * Gcc 8.1.0 doesn't have that problem.
- */
-static void swap_words_32(void *a, void *b, size_t n)
-{
-	do {
-		u32 t = *(u32 *)(a + (n -= 4));
-		*(u32 *)(a + n) = *(u32 *)(b + n);
-		*(u32 *)(b + n) = t;
-	} while (n);
-}
-
-/**
- * swap_words_64 - swap two elements in 64-bit chunks
- * @a: pointer to the first element to swap
- * @b: pointer to the second element to swap
- * @n: element size (must be a multiple of 8)
- *
- * Exchange the two objects in memory.  This exploits base+index
- * addressing, which basically all CPUs have, to minimize loop overhead
- * computations.
- *
- * We'd like to use 64-bit loads if possible.  If they're not, emulating
- * one requires base+index+4 addressing which x86 has but most other
- * processors do not.  If CONFIG_64BIT, we definitely have 64-bit loads,
- * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
- * x32 ABI).  Are there any cases the kernel needs to worry about?
- */
-static void swap_words_64(void *a, void *b, size_t n)
-{
-	do {
-#ifdef CONFIG_64BIT
-		u64 t = *(u64 *)(a + (n -= 8));
-		*(u64 *)(a + n) = *(u64 *)(b + n);
-		*(u64 *)(b + n) = t;
-#else
-		/* Use two 32-bit transfers to avoid base+index+4 addressing */
-		u32 t = *(u32 *)(a + (n -= 4));
-		*(u32 *)(a + n) = *(u32 *)(b + n);
-		*(u32 *)(b + n) = t;
-
-		t = *(u32 *)(a + (n -= 4));
-		*(u32 *)(a + n) = *(u32 *)(b + n);
-		*(u32 *)(b + n) = t;
-#endif
-	} while (n);
-}
-
-/**
- * swap_bytes - swap two elements a byte at a time
- * @a: pointer to the first element to swap
- * @b: pointer to the second element to swap
- * @n: element size
- *
- * This is the fallback if alignment doesn't allow using larger chunks.
- */
-static void swap_bytes(void *a, void *b, size_t n)
-{
-	do {
-		char t = ((char *)a)[--n];
-		((char *)a)[n] = ((char *)b)[n];
-		((char *)b)[n] = t;
-	} while (n);
-}
-
-/*
- * The values are arbitrary as long as they can't be confused with
- * a pointer, but small integers make for the smallest compare
- * instructions.
- */
-#define SWAP_WORDS_64 (swap_r_func_t)0
-#define SWAP_WORDS_32 (swap_r_func_t)1
-#define SWAP_BYTES    (swap_r_func_t)2
-#define SWAP_WRAPPER  (swap_r_func_t)3
-
-struct wrapper {
-	cmp_func_t cmp;
-	swap_func_t swap_func;
-};
-
-/*
- * The function pointer is last to make tail calls most efficient if the
- * compiler decides not to inline this function.
- */
-static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv)
-{
-	if (swap_func == SWAP_WRAPPER) {
-		((const struct wrapper *)priv)->swap_func(a, b, (int)size);
-		return;
-	}
-
-	if (swap_func == SWAP_WORDS_64)
-		swap_words_64(a, b, size);
-	else if (swap_func == SWAP_WORDS_32)
-		swap_words_32(a, b, size);
-	else if (swap_func == SWAP_BYTES)
-		swap_bytes(a, b, size);
-	else
-		swap_func(a, b, (int)size, priv);
-}
-
-#define _CMP_WRAPPER ((cmp_r_func_t)0L)
-
-static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv)
-{
-	if (cmp == _CMP_WRAPPER)
-		return ((const struct wrapper *)priv)->cmp(a, b);
-	return cmp(a, b, priv);
-}
-
-static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size,
-			 cmp_r_func_t cmp_func, const void *priv,
-			 size_t l, size_t r)
-{
-	return do_cmp(base + inorder_to_eytzinger0(l, n) * size,
-		      base + inorder_to_eytzinger0(r, n) * size,
-		      cmp_func, priv);
-}
-
-static inline void eytzinger0_do_swap(void *base, size_t n, size_t size,
-			   swap_r_func_t swap_func, const void *priv,
-			   size_t l, size_t r)
-{
-	do_swap(base + inorder_to_eytzinger0(l, n) * size,
-		base + inorder_to_eytzinger0(r, n) * size,
-		size, swap_func, priv);
-}
-
-void eytzinger0_sort_r(void *base, size_t n, size_t size,
-		       cmp_r_func_t cmp_func,
-		       swap_r_func_t swap_func,
-		       const void *priv)
-{
-	int i, j, k;
-
-	/* called from 'sort' without swap function, let's pick the default */
-	if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func)
-		swap_func = NULL;
-
-	if (!swap_func) {
-		if (is_aligned(base, size, 8))
-			swap_func = SWAP_WORDS_64;
-		else if (is_aligned(base, size, 4))
-			swap_func = SWAP_WORDS_32;
-		else
-			swap_func = SWAP_BYTES;
-	}
-
-	/* heapify */
-	for (i = n / 2 - 1; i >= 0; --i) {
-		/* Find the sift-down path all the way to the leaves. */
-		for (j = i; k = j * 2 + 1, k + 1 < n;)
-			j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
-
-		/* Special case for the last leaf with no sibling. */
-		if (j * 2 + 2 == n)
-			j = j * 2 + 1;
-
-		/* Backtrack to the correct location. */
-		while (j != i && eytzinger0_do_cmp(base, n, size, cmp_func, priv, i, j) >= 0)
-			j = (j - 1) / 2;
-
-		/* Shift the element into its correct place. */
-		for (k = j; j != i;) {
-			j = (j - 1) / 2;
-			eytzinger0_do_swap(base, n, size, swap_func, priv, j, k);
-		}
-	}
-
-	/* sort */
-	for (i = n - 1; i > 0; --i) {
-		eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i);
-
-		/* Find the sift-down path all the way to the leaves. */
-		for (j = 0; k = j * 2 + 1, k + 1 < i;)
-			j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
-
-		/* Special case for the last leaf with no sibling. */
-		if (j * 2 + 2 == i)
-			j = j * 2 + 1;
-
-		/* Backtrack to the correct location. */
-		while (j && eytzinger0_do_cmp(base, n, size, cmp_func, priv, 0, j) >= 0)
-			j = (j - 1) / 2;
-
-		/* Shift the element into its correct place. */
-		for (k = j; j;) {
-			j = (j - 1) / 2;
-			eytzinger0_do_swap(base, n, size, swap_func, priv, j, k);
-		}
-	}
-}
-
-void eytzinger0_sort(void *base, size_t n, size_t size,
-		     cmp_func_t cmp_func,
-		     swap_func_t swap_func)
-{
-	struct wrapper w = {
-		.cmp  = cmp_func,
-		.swap_func = swap_func,
-	};
-
-	return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
-}
-
-#if 0
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/ktime.h>
-
-static u64 cmp_count;
-
-static int mycmp(const void *a, const void *b)
-{
-	u32 _a = *(u32 *)a;
-	u32 _b = *(u32 *)b;
-
-	cmp_count++;
-	if (_a < _b)
-		return -1;
-	else if (_a > _b)
-		return 1;
-	else
-		return 0;
-}
-
-static int test(void)
-{
-	size_t N, i;
-	ktime_t start, end;
-	s64 delta;
-	u32 *arr;
-
-	for (N = 10000; N <= 100000; N += 10000) {
-		arr = kmalloc_array(N, sizeof(u32), GFP_KERNEL);
-		cmp_count = 0;
-
-		for (i = 0; i < N; i++)
-			arr[i] = get_random_u32();
-
-		start = ktime_get();
-		eytzinger0_sort(arr, N, sizeof(u32), mycmp, NULL);
-		end = ktime_get();
-
-		delta = ktime_us_delta(end, start);
-		printk(KERN_INFO "time: %lld\n", delta);
-		printk(KERN_INFO "comparisons: %lld\n", cmp_count);
-
-		u32 prev = 0;
-
-		eytzinger0_for_each(i, N) {
-			if (prev > arr[i])
-				goto err;
-			prev = arr[i];
-		}
-
-		kfree(arr);
-	}
-	return 0;
-
-err:
-	kfree(arr);
-	return -1;
-}
-#endif
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
deleted file mode 100644
index 24840aee335c..000000000000
--- a/fs/bcachefs/eytzinger.h
+++ /dev/null
@@ -1,306 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _EYTZINGER_H
-#define _EYTZINGER_H
-
-#include <linux/bitops.h>
-#include <linux/log2.h>
-
-#ifdef EYTZINGER_DEBUG
-#define EYTZINGER_BUG_ON(cond)		BUG_ON(cond)
-#else
-#define EYTZINGER_BUG_ON(cond)
-#endif
-
-/*
- * Traversal for trees in eytzinger layout - a full binary tree layed out in an
- * array.
- *
- * Consider using an eytzinger tree any time you would otherwise be doing binary
- * search over an array. Binary search is a worst case scenario for branch
- * prediction and prefetching, but in an eytzinger tree every node's children
- * are adjacent in memory, thus we can prefetch children before knowing the
- * result of the comparison, assuming multiple nodes fit on a cacheline.
- *
- * Two variants are provided, for one based indexing and zero based indexing.
- *
- * Zero based indexing is more convenient, but one based indexing has better
- * alignment and thus better performance because each new level of the tree
- * starts at a power of two, and thus if element 0 was cacheline aligned, each
- * new level will be as well.
- */
-
-static inline unsigned eytzinger1_child(unsigned i, unsigned child)
-{
-	EYTZINGER_BUG_ON(child > 1);
-
-	return (i << 1) + child;
-}
-
-static inline unsigned eytzinger1_left_child(unsigned i)
-{
-	return eytzinger1_child(i, 0);
-}
-
-static inline unsigned eytzinger1_right_child(unsigned i)
-{
-	return eytzinger1_child(i, 1);
-}
-
-static inline unsigned eytzinger1_first(unsigned size)
-{
-	return rounddown_pow_of_two(size);
-}
-
-static inline unsigned eytzinger1_last(unsigned size)
-{
-	return rounddown_pow_of_two(size + 1) - 1;
-}
-
-/*
- * eytzinger1_next() and eytzinger1_prev() have the nice properties that
- *
- * eytzinger1_next(0) == eytzinger1_first())
- * eytzinger1_prev(0) == eytzinger1_last())
- *
- * eytzinger1_prev(eytzinger1_first()) == 0
- * eytzinger1_next(eytzinger1_last()) == 0
- */
-
-static inline unsigned eytzinger1_next(unsigned i, unsigned size)
-{
-	EYTZINGER_BUG_ON(i > size);
-
-	if (eytzinger1_right_child(i) <= size) {
-		i = eytzinger1_right_child(i);
-
-		i <<= __fls(size + 1) - __fls(i);
-		i >>= i > size;
-	} else {
-		i >>= ffz(i) + 1;
-	}
-
-	return i;
-}
-
-static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
-{
-	EYTZINGER_BUG_ON(i > size);
-
-	if (eytzinger1_left_child(i) <= size) {
-		i = eytzinger1_left_child(i) + 1;
-
-		i <<= __fls(size + 1) - __fls(i);
-		i -= 1;
-		i >>= i > size;
-	} else {
-		i >>= __ffs(i) + 1;
-	}
-
-	return i;
-}
-
-static inline unsigned eytzinger1_extra(unsigned size)
-{
-	return (size + 1 - rounddown_pow_of_two(size)) << 1;
-}
-
-static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
-					      unsigned extra)
-{
-	unsigned b = __fls(i);
-	unsigned shift = __fls(size) - b;
-	int s;
-
-	EYTZINGER_BUG_ON(!i || i > size);
-
-	i  ^= 1U << b;
-	i <<= 1;
-	i  |= 1;
-	i <<= shift;
-
-	/*
-	 * sign bit trick:
-	 *
-	 * if (i > extra)
-	 *	i -= (i - extra) >> 1;
-	 */
-	s = extra - i;
-	i += (s >> 1) & (s >> 31);
-
-	return i;
-}
-
-static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
-					       unsigned extra)
-{
-	unsigned shift;
-	int s;
-
-	EYTZINGER_BUG_ON(!i || i > size);
-
-	/*
-	 * sign bit trick:
-	 *
-	 * if (i > extra)
-	 *	i += i - extra;
-	 */
-	s = extra - i;
-	i -= s & (s >> 31);
-
-	shift = __ffs(i);
-
-	i >>= shift + 1;
-	i  |= 1U << (__fls(size) - shift);
-
-	return i;
-}
-
-static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size)
-{
-	return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size));
-}
-
-static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
-{
-	return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size));
-}
-
-#define eytzinger1_for_each(_i, _size)			\
-	for (unsigned (_i) = eytzinger1_first((_size));	\
-	     (_i) != 0;					\
-	     (_i) = eytzinger1_next((_i), (_size)))
-
-/* Zero based indexing version: */
-
-static inline unsigned eytzinger0_child(unsigned i, unsigned child)
-{
-	EYTZINGER_BUG_ON(child > 1);
-
-	return (i << 1) + 1 + child;
-}
-
-static inline unsigned eytzinger0_left_child(unsigned i)
-{
-	return eytzinger0_child(i, 0);
-}
-
-static inline unsigned eytzinger0_right_child(unsigned i)
-{
-	return eytzinger0_child(i, 1);
-}
-
-static inline unsigned eytzinger0_first(unsigned size)
-{
-	return eytzinger1_first(size) - 1;
-}
-
-static inline unsigned eytzinger0_last(unsigned size)
-{
-	return eytzinger1_last(size) - 1;
-}
-
-static inline unsigned eytzinger0_next(unsigned i, unsigned size)
-{
-	return eytzinger1_next(i + 1, size) - 1;
-}
-
-static inline unsigned eytzinger0_prev(unsigned i, unsigned size)
-{
-	return eytzinger1_prev(i + 1, size) - 1;
-}
-
-static inline unsigned eytzinger0_extra(unsigned size)
-{
-	return eytzinger1_extra(size);
-}
-
-static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size,
-					       unsigned extra)
-{
-	return __eytzinger1_to_inorder(i + 1, size, extra) - 1;
-}
-
-static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size,
-					       unsigned extra)
-{
-	return __inorder_to_eytzinger1(i + 1, size, extra) - 1;
-}
-
-static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size)
-{
-	return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size));
-}
-
-static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
-{
-	return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size));
-}
-
-#define eytzinger0_for_each(_i, _size)			\
-	for (unsigned (_i) = eytzinger0_first((_size));	\
-	     (_i) != -1;				\
-	     (_i) = eytzinger0_next((_i), (_size)))
-
-/* return greatest node <= @search, or -1 if not found */
-static inline int eytzinger0_find_le(void *base, size_t nr, size_t size,
-				     cmp_func_t cmp, const void *search)
-{
-	unsigned i, n = 0;
-
-	if (!nr)
-		return -1;
-
-	do {
-		i = n;
-		n = eytzinger0_child(i, cmp(base + i * size, search) <= 0);
-	} while (n < nr);
-
-	if (n & 1) {
-		/*
-		 * @i was greater than @search, return previous node:
-		 *
-		 * if @i was leftmost/smallest element,
-		 * eytzinger0_prev(eytzinger0_first())) returns -1, as expected
-		 */
-		return eytzinger0_prev(i, nr);
-	} else {
-		return i;
-	}
-}
-
-static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size,
-				     cmp_func_t cmp, const void *search)
-{
-	ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);
-
-	/*
-	 * if eytitzinger0_find_le() returned -1 - no element was <= search - we
-	 * want to return the first element; next/prev identities mean this work
-	 * as expected
-	 *
-	 * similarly if find_le() returns last element, we should return -1;
-	 * identities mean this all works out:
-	 */
-	return eytzinger0_next(idx, nr);
-}
-
-#define eytzinger0_find(base, nr, size, _cmp, search)			\
-({									\
-	void *_base		= (base);				\
-	const void *_search	= (search);				\
-	size_t _nr		= (nr);					\
-	size_t _size		= (size);				\
-	size_t _i		= 0;					\
-	int _res;							\
-									\
-	while (_i < _nr &&						\
-	       (_res = _cmp(_search, _base + _i * _size)))		\
-		_i = eytzinger0_child(_i, _res > 0);			\
-	_i;								\
-})
-
-void eytzinger0_sort_r(void *, size_t, size_t,
-		       cmp_r_func_t, swap_r_func_t, const void *);
-void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t);
-
-#endif /* _EYTZINGER_H */
diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h
deleted file mode 100644
index d8153fe27037..000000000000
--- a/fs/bcachefs/fifo.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FIFO_H
-#define _BCACHEFS_FIFO_H
-
-#include "util.h"
-
-#define FIFO(type)							\
-struct {								\
-	size_t front, back, size, mask;					\
-	type *data;							\
-}
-
-#define DECLARE_FIFO(type, name)	FIFO(type) name
-
-#define fifo_buf_size(fifo)						\
-	((fifo)->size							\
-	 ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0])	\
-	 : 0)
-
-#define init_fifo(fifo, _size, _gfp)					\
-({									\
-	(fifo)->front	= (fifo)->back = 0;				\
-	(fifo)->size	= (_size);					\
-	(fifo)->mask	= (fifo)->size					\
-		? roundup_pow_of_two((fifo)->size) - 1			\
-		: 0;							\
-	(fifo)->data	= kvmalloc(fifo_buf_size(fifo), (_gfp));	\
-})
-
-#define free_fifo(fifo)							\
-do {									\
-	kvfree((fifo)->data);						\
-	(fifo)->data = NULL;						\
-} while (0)
-
-#define fifo_swap(l, r)							\
-do {									\
-	swap((l)->front, (r)->front);					\
-	swap((l)->back, (r)->back);					\
-	swap((l)->size, (r)->size);					\
-	swap((l)->mask, (r)->mask);					\
-	swap((l)->data, (r)->data);					\
-} while (0)
-
-#define fifo_move(dest, src)						\
-do {									\
-	typeof(*((dest)->data)) _t;					\
-	while (!fifo_full(dest) &&					\
-	       fifo_pop(src, _t))					\
-		fifo_push(dest, _t);					\
-} while (0)
-
-#define fifo_used(fifo)		(((fifo)->back - (fifo)->front))
-#define fifo_free(fifo)		((fifo)->size - fifo_used(fifo))
-
-#define fifo_empty(fifo)	((fifo)->front == (fifo)->back)
-#define fifo_full(fifo)		(fifo_used(fifo) == (fifo)->size)
-
-#define fifo_peek_front(fifo)	((fifo)->data[(fifo)->front & (fifo)->mask])
-#define fifo_peek_back(fifo)	((fifo)->data[((fifo)->back - 1) & (fifo)->mask])
-
-#define fifo_entry_idx_abs(fifo, p)					\
-	((((p) >= &fifo_peek_front(fifo)				\
-	   ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) +		\
-	   (((p) - (fifo)->data)))
-
-#define fifo_entry_idx(fifo, p)	(((p) - &fifo_peek_front(fifo)) & (fifo)->mask)
-#define fifo_idx_entry(fifo, i)	((fifo)->data[((fifo)->front + (i)) & (fifo)->mask])
-
-#define fifo_push_back_ref(f)						\
-	(fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask])
-
-#define fifo_push_front_ref(f)						\
-	(fifo_full((f)) ? NULL : &(f)->data[--(f)->front & (f)->mask])
-
-#define fifo_push_back(fifo, new)					\
-({									\
-	typeof((fifo)->data) _r = fifo_push_back_ref(fifo);		\
-	if (_r)								\
-		*_r = (new);						\
-	_r != NULL;							\
-})
-
-#define fifo_push_front(fifo, new)					\
-({									\
-	typeof((fifo)->data) _r = fifo_push_front_ref(fifo);		\
-	if (_r)								\
-		*_r = (new);						\
-	_r != NULL;							\
-})
-
-#define fifo_pop_front(fifo, i)						\
-({									\
-	bool _r = !fifo_empty((fifo));					\
-	if (_r)								\
-		(i) = (fifo)->data[(fifo)->front++ & (fifo)->mask];	\
-	_r;								\
-})
-
-#define fifo_pop_back(fifo, i)						\
-({									\
-	bool _r = !fifo_empty((fifo));					\
-	if (_r)								\
-		(i) = (fifo)->data[--(fifo)->back & (fifo)->mask];	\
-	_r;								\
-})
-
-#define fifo_push_ref(fifo)	fifo_push_back_ref(fifo)
-#define fifo_push(fifo, i)	fifo_push_back(fifo, (i))
-#define fifo_pop(fifo, i)	fifo_pop_front(fifo, (i))
-#define fifo_peek(fifo)		fifo_peek_front(fifo)
-
-#define fifo_for_each_entry(_entry, _fifo, _iter)			\
-	for (typecheck(typeof((_fifo)->front), _iter),			\
-	     (_iter) = (_fifo)->front;					\
-	     ((_iter != (_fifo)->back) &&				\
-	      (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true));	\
-	     (_iter)++)
-
-#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter)			\
-	for (typecheck(typeof((_fifo)->front), _iter),			\
-	     (_iter) = (_fifo)->front;					\
-	     ((_iter != (_fifo)->back) &&				\
-	      (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true));	\
-	     (_iter)++)
-
-#endif /* _BCACHEFS_FIFO_H */
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
deleted file mode 100644
index 508d029ac53d..000000000000
--- a/fs/bcachefs/fs-common.c
+++ /dev/null
@@ -1,549 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "acl.h"
-#include "btree_update.h"
-#include "dirent.h"
-#include "fs-common.h"
-#include "inode.h"
-#include "subvolume.h"
-#include "xattr.h"
-
-#include <linux/posix_acl.h>
-
-static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode)
-{
-	return S_ISDIR(inode->bi_mode) && !inode->bi_subvol;
-}
-
-int bch2_create_trans(struct btree_trans *trans,
-		      subvol_inum dir,
-		      struct bch_inode_unpacked *dir_u,
-		      struct bch_inode_unpacked *new_inode,
-		      const struct qstr *name,
-		      uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
-		      struct posix_acl *default_acl,
-		      struct posix_acl *acl,
-		      subvol_inum snapshot_src,
-		      unsigned flags)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter dir_iter = { NULL };
-	struct btree_iter inode_iter = { NULL };
-	subvol_inum new_inum = dir;
-	u64 now = bch2_current_time(c);
-	u64 cpu = raw_smp_processor_id();
-	u64 dir_target;
-	u32 snapshot;
-	unsigned dir_type = mode_to_type(mode);
-	int ret;
-
-	ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
-	if (ret)
-		goto err;
-
-	ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
-	if (ret)
-		goto err;
-
-	if (!(flags & BCH_CREATE_SNAPSHOT)) {
-		/* Normal create path - allocate a new inode: */
-		bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
-
-		if (flags & BCH_CREATE_TMPFILE)
-			new_inode->bi_flags |= BCH_INODE_unlinked;
-
-		ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu);
-		if (ret)
-			goto err;
-
-		snapshot_src = (subvol_inum) { 0 };
-	} else {
-		/*
-		 * Creating a snapshot - we're not allocating a new inode, but
-		 * we do have to lookup the root inode of the subvolume we're
-		 * snapshotting and update it (in the new snapshot):
-		 */
-
-		if (!snapshot_src.inum) {
-			/* Inode wasn't specified, just snapshot: */
-			struct bch_subvolume s;
-
-			ret = bch2_subvolume_get(trans, snapshot_src.subvol, true,
-						 BTREE_ITER_cached, &s);
-			if (ret)
-				goto err;
-
-			snapshot_src.inum = le64_to_cpu(s.inode);
-		}
-
-		ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src,
-				      BTREE_ITER_intent);
-		if (ret)
-			goto err;
-
-		if (new_inode->bi_subvol != snapshot_src.subvol) {
-			/* Not a subvolume root: */
-			ret = -EINVAL;
-			goto err;
-		}
-
-		/*
-		 * If we're not root, we have to own the subvolume being
-		 * snapshotted:
-		 */
-		if (uid && new_inode->bi_uid != uid) {
-			ret = -EPERM;
-			goto err;
-		}
-
-		flags |= BCH_CREATE_SUBVOL;
-	}
-
-	new_inum.inum	= new_inode->bi_inum;
-	dir_target	= new_inode->bi_inum;
-
-	if (flags & BCH_CREATE_SUBVOL) {
-		u32 new_subvol, dir_snapshot;
-
-		ret = bch2_subvolume_create(trans, new_inode->bi_inum,
-					    dir.subvol,
-					    snapshot_src.subvol,
-					    &new_subvol, &snapshot,
-					    (flags & BCH_CREATE_SNAPSHOT_RO) != 0);
-		if (ret)
-			goto err;
-
-		new_inode->bi_parent_subvol	= dir.subvol;
-		new_inode->bi_subvol		= new_subvol;
-		new_inum.subvol			= new_subvol;
-		dir_target			= new_subvol;
-		dir_type			= DT_SUBVOL;
-
-		ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot);
-		if (ret)
-			goto err;
-
-		bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot);
-		ret = bch2_btree_iter_traverse(&dir_iter);
-		if (ret)
-			goto err;
-	}
-
-	if (!(flags & BCH_CREATE_SNAPSHOT)) {
-		if (default_acl) {
-			ret = bch2_set_acl_trans(trans, new_inum, new_inode,
-						 default_acl, ACL_TYPE_DEFAULT);
-			if (ret)
-				goto err;
-		}
-
-		if (acl) {
-			ret = bch2_set_acl_trans(trans, new_inum, new_inode,
-						 acl, ACL_TYPE_ACCESS);
-			if (ret)
-				goto err;
-		}
-	}
-
-	if (!(flags & BCH_CREATE_TMPFILE)) {
-		struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u);
-		u64 dir_offset;
-
-		if (is_subdir_for_nlink(new_inode))
-			dir_u->bi_nlink++;
-		dir_u->bi_mtime = dir_u->bi_ctime = now;
-
-		ret = bch2_inode_write(trans, &dir_iter, dir_u);
-		if (ret)
-			goto err;
-
-		ret = bch2_dirent_create(trans, dir, &dir_hash,
-					 dir_type,
-					 name,
-					 dir_target,
-					 &dir_offset,
-					 STR_HASH_must_create);
-		if (ret)
-			goto err;
-
-		new_inode->bi_dir		= dir_u->bi_inum;
-		new_inode->bi_dir_offset	= dir_offset;
-	}
-
-	inode_iter.flags &= ~BTREE_ITER_all_snapshots;
-	bch2_btree_iter_set_snapshot(&inode_iter, snapshot);
-
-	ret   = bch2_btree_iter_traverse(&inode_iter) ?:
-		bch2_inode_write(trans, &inode_iter, new_inode);
-err:
-	bch2_trans_iter_exit(trans, &inode_iter);
-	bch2_trans_iter_exit(trans, &dir_iter);
-	return ret;
-}
-
-int bch2_link_trans(struct btree_trans *trans,
-		    subvol_inum dir,  struct bch_inode_unpacked *dir_u,
-		    subvol_inum inum, struct bch_inode_unpacked *inode_u,
-		    const struct qstr *name)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter dir_iter = { NULL };
-	struct btree_iter inode_iter = { NULL };
-	struct bch_hash_info dir_hash;
-	u64 now = bch2_current_time(c);
-	u64 dir_offset = 0;
-	int ret;
-
-	if (dir.subvol != inum.subvol)
-		return -EXDEV;
-
-	ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent);
-	if (ret)
-		return ret;
-
-	inode_u->bi_ctime = now;
-	ret = bch2_inode_nlink_inc(inode_u);
-	if (ret)
-		goto err;
-
-	ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
-	if (ret)
-		goto err;
-
-	if (bch2_reinherit_attrs(inode_u, dir_u)) {
-		ret = -EXDEV;
-		goto err;
-	}
-
-	dir_u->bi_mtime = dir_u->bi_ctime = now;
-
-	dir_hash = bch2_hash_info_init(c, dir_u);
-
-	ret = bch2_dirent_create(trans, dir, &dir_hash,
-				 mode_to_type(inode_u->bi_mode),
-				 name, inum.inum, &dir_offset,
-				 STR_HASH_must_create);
-	if (ret)
-		goto err;
-
-	inode_u->bi_dir		= dir.inum;
-	inode_u->bi_dir_offset	= dir_offset;
-
-	ret =   bch2_inode_write(trans, &dir_iter, dir_u) ?:
-		bch2_inode_write(trans, &inode_iter, inode_u);
-err:
-	bch2_trans_iter_exit(trans, &dir_iter);
-	bch2_trans_iter_exit(trans, &inode_iter);
-	return ret;
-}
-
-int bch2_unlink_trans(struct btree_trans *trans,
-		      subvol_inum dir,
-		      struct bch_inode_unpacked *dir_u,
-		      struct bch_inode_unpacked *inode_u,
-		      const struct qstr *name,
-		      bool deleting_subvol)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter dir_iter = { NULL };
-	struct btree_iter dirent_iter = { NULL };
-	struct btree_iter inode_iter = { NULL };
-	struct bch_hash_info dir_hash;
-	subvol_inum inum;
-	u64 now = bch2_current_time(c);
-	struct bkey_s_c k;
-	int ret;
-
-	ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
-	if (ret)
-		goto err;
-
-	dir_hash = bch2_hash_info_init(c, dir_u);
-
-	ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
-				       name, &inum, BTREE_ITER_intent);
-	if (ret)
-		goto err;
-
-	ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum,
-			      BTREE_ITER_intent);
-	if (ret)
-		goto err;
-
-	if (!deleting_subvol && S_ISDIR(inode_u->bi_mode)) {
-		ret = bch2_empty_dir_trans(trans, inum);
-		if (ret)
-			goto err;
-	}
-
-	if (deleting_subvol && !inode_u->bi_subvol) {
-		ret = -BCH_ERR_ENOENT_not_subvol;
-		goto err;
-	}
-
-	if (inode_u->bi_subvol) {
-		/* Recursive subvolume destroy not allowed (yet?) */
-		ret = bch2_subvol_has_children(trans, inode_u->bi_subvol);
-		if (ret)
-			goto err;
-	}
-
-	if (deleting_subvol || inode_u->bi_subvol) {
-		ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol);
-		if (ret)
-			goto err;
-
-		k = bch2_btree_iter_peek_slot(&dirent_iter);
-		ret = bkey_err(k);
-		if (ret)
-			goto err;
-
-		/*
-		 * If we're deleting a subvolume, we need to really delete the
-		 * dirent, not just emit a whiteout in the current snapshot:
-		 */
-		bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot);
-		ret = bch2_btree_iter_traverse(&dirent_iter);
-		if (ret)
-			goto err;
-	} else {
-		bch2_inode_nlink_dec(trans, inode_u);
-	}
-
-	if (inode_u->bi_dir		== dirent_iter.pos.inode &&
-	    inode_u->bi_dir_offset	== dirent_iter.pos.offset) {
-		inode_u->bi_dir		= 0;
-		inode_u->bi_dir_offset	= 0;
-	}
-
-	dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
-	dir_u->bi_nlink -= is_subdir_for_nlink(inode_u);
-
-	ret =   bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
-				    &dir_hash, &dirent_iter,
-				    BTREE_UPDATE_internal_snapshot_node) ?:
-		bch2_inode_write(trans, &dir_iter, dir_u) ?:
-		bch2_inode_write(trans, &inode_iter, inode_u);
-err:
-	bch2_trans_iter_exit(trans, &inode_iter);
-	bch2_trans_iter_exit(trans, &dirent_iter);
-	bch2_trans_iter_exit(trans, &dir_iter);
-	return ret;
-}
-
-bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
-			  struct bch_inode_unpacked *src_u)
-{
-	u64 src, dst;
-	unsigned id;
-	bool ret = false;
-
-	for (id = 0; id < Inode_opt_nr; id++) {
-		/* Skip attributes that were explicitly set on this inode */
-		if (dst_u->bi_fields_set & (1 << id))
-			continue;
-
-		src = bch2_inode_opt_get(src_u, id);
-		dst = bch2_inode_opt_get(dst_u, id);
-
-		if (src == dst)
-			continue;
-
-		bch2_inode_opt_set(dst_u, id, src);
-		ret = true;
-	}
-
-	return ret;
-}
-
-static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_parent)
-{
-	struct btree_iter iter;
-	struct bkey_i_subvolume *s =
-		bch2_bkey_get_mut_typed(trans, &iter,
-			BTREE_ID_subvolumes, POS(0, subvol),
-			BTREE_ITER_cached, subvolume);
-	int ret = PTR_ERR_OR_ZERO(s);
-	if (ret)
-		return ret;
-
-	s->v.fs_path_parent = cpu_to_le32(new_parent);
-	bch2_trans_iter_exit(trans, &iter);
-	return 0;
-}
-
-int bch2_rename_trans(struct btree_trans *trans,
-		      subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u,
-		      subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u,
-		      struct bch_inode_unpacked *src_inode_u,
-		      struct bch_inode_unpacked *dst_inode_u,
-		      const struct qstr *src_name,
-		      const struct qstr *dst_name,
-		      enum bch_rename_mode mode)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter src_dir_iter = { NULL };
-	struct btree_iter dst_dir_iter = { NULL };
-	struct btree_iter src_inode_iter = { NULL };
-	struct btree_iter dst_inode_iter = { NULL };
-	struct bch_hash_info src_hash, dst_hash;
-	subvol_inum src_inum, dst_inum;
-	u64 src_offset, dst_offset;
-	u64 now = bch2_current_time(c);
-	int ret;
-
-	ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir,
-			      BTREE_ITER_intent);
-	if (ret)
-		goto err;
-
-	src_hash = bch2_hash_info_init(c, src_dir_u);
-
-	if (dst_dir.inum	!= src_dir.inum ||
-	    dst_dir.subvol	!= src_dir.subvol) {
-		ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir,
-				      BTREE_ITER_intent);
-		if (ret)
-			goto err;
-
-		dst_hash = bch2_hash_info_init(c, dst_dir_u);
-	} else {
-		dst_dir_u = src_dir_u;
-		dst_hash = src_hash;
-	}
-
-	ret = bch2_dirent_rename(trans,
-				 src_dir, &src_hash,
-				 dst_dir, &dst_hash,
-				 src_name, &src_inum, &src_offset,
-				 dst_name, &dst_inum, &dst_offset,
-				 mode);
-	if (ret)
-		goto err;
-
-	ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum,
-			      BTREE_ITER_intent);
-	if (ret)
-		goto err;
-
-	if (dst_inum.inum) {
-		ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum,
-				      BTREE_ITER_intent);
-		if (ret)
-			goto err;
-	}
-
-	if (src_inode_u->bi_subvol &&
-	    dst_dir.subvol != src_inode_u->bi_parent_subvol) {
-		ret = subvol_update_parent(trans, src_inode_u->bi_subvol, dst_dir.subvol);
-		if (ret)
-			goto err;
-	}
-
-	if (mode == BCH_RENAME_EXCHANGE &&
-	    dst_inode_u->bi_subvol &&
-	    src_dir.subvol != dst_inode_u->bi_parent_subvol) {
-		ret = subvol_update_parent(trans, dst_inode_u->bi_subvol, src_dir.subvol);
-		if (ret)
-			goto err;
-	}
-
-	/* Can't move across subvolumes, unless it's a subvolume root: */
-	if (src_dir.subvol != dst_dir.subvol &&
-	    (!src_inode_u->bi_subvol ||
-	     (dst_inum.inum && !dst_inode_u->bi_subvol))) {
-		ret = -EXDEV;
-		goto err;
-	}
-
-	if (src_inode_u->bi_parent_subvol)
-		src_inode_u->bi_parent_subvol = dst_dir.subvol;
-
-	if ((mode == BCH_RENAME_EXCHANGE) &&
-	    dst_inode_u->bi_parent_subvol)
-		dst_inode_u->bi_parent_subvol = src_dir.subvol;
-
-	src_inode_u->bi_dir		= dst_dir_u->bi_inum;
-	src_inode_u->bi_dir_offset	= dst_offset;
-
-	if (mode == BCH_RENAME_EXCHANGE) {
-		dst_inode_u->bi_dir		= src_dir_u->bi_inum;
-		dst_inode_u->bi_dir_offset	= src_offset;
-	}
-
-	if (mode == BCH_RENAME_OVERWRITE &&
-	    dst_inode_u->bi_dir		== dst_dir_u->bi_inum &&
-	    dst_inode_u->bi_dir_offset	== src_offset) {
-		dst_inode_u->bi_dir		= 0;
-		dst_inode_u->bi_dir_offset	= 0;
-	}
-
-	if (mode == BCH_RENAME_OVERWRITE) {
-		if (S_ISDIR(src_inode_u->bi_mode) !=
-		    S_ISDIR(dst_inode_u->bi_mode)) {
-			ret = -ENOTDIR;
-			goto err;
-		}
-
-		if (S_ISDIR(dst_inode_u->bi_mode)) {
-			ret = bch2_empty_dir_trans(trans, dst_inum);
-			if (ret)
-				goto err;
-		}
-	}
-
-	if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) &&
-	    S_ISDIR(src_inode_u->bi_mode)) {
-		ret = -EXDEV;
-		goto err;
-	}
-
-	if (mode == BCH_RENAME_EXCHANGE &&
-	    bch2_reinherit_attrs(dst_inode_u, src_dir_u) &&
-	    S_ISDIR(dst_inode_u->bi_mode)) {
-		ret = -EXDEV;
-		goto err;
-	}
-
-	if (is_subdir_for_nlink(src_inode_u)) {
-		src_dir_u->bi_nlink--;
-		dst_dir_u->bi_nlink++;
-	}
-
-	if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) {
-		dst_dir_u->bi_nlink--;
-		src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE;
-	}
-
-	if (mode == BCH_RENAME_OVERWRITE)
-		bch2_inode_nlink_dec(trans, dst_inode_u);
-
-	src_dir_u->bi_mtime		= now;
-	src_dir_u->bi_ctime		= now;
-
-	if (src_dir.inum != dst_dir.inum) {
-		dst_dir_u->bi_mtime	= now;
-		dst_dir_u->bi_ctime	= now;
-	}
-
-	src_inode_u->bi_ctime		= now;
-
-	if (dst_inum.inum)
-		dst_inode_u->bi_ctime	= now;
-
-	ret =   bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?:
-		(src_dir.inum != dst_dir.inum
-		 ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u)
-		 : 0) ?:
-		bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?:
-		(dst_inum.inum
-		 ? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u)
-		 : 0);
-err:
-	bch2_trans_iter_exit(trans, &dst_inode_iter);
-	bch2_trans_iter_exit(trans, &src_inode_iter);
-	bch2_trans_iter_exit(trans, &dst_dir_iter);
-	bch2_trans_iter_exit(trans, &src_dir_iter);
-	return ret;
-}
diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h
deleted file mode 100644
index dde237859514..000000000000
--- a/fs/bcachefs/fs-common.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_COMMON_H
-#define _BCACHEFS_FS_COMMON_H
-
-struct posix_acl;
-
-#define BCH_CREATE_TMPFILE		(1U << 0)
-#define BCH_CREATE_SUBVOL		(1U << 1)
-#define BCH_CREATE_SNAPSHOT		(1U << 2)
-#define BCH_CREATE_SNAPSHOT_RO		(1U << 3)
-
-int bch2_create_trans(struct btree_trans *, subvol_inum,
-		      struct bch_inode_unpacked *,
-		      struct bch_inode_unpacked *,
-		      const struct qstr *,
-		      uid_t, gid_t, umode_t, dev_t,
-		      struct posix_acl *,
-		      struct posix_acl *,
-		      subvol_inum, unsigned);
-
-int bch2_link_trans(struct btree_trans *,
-		    subvol_inum, struct bch_inode_unpacked *,
-		    subvol_inum, struct bch_inode_unpacked *,
-		    const struct qstr *);
-
-int bch2_unlink_trans(struct btree_trans *, subvol_inum,
-		      struct bch_inode_unpacked *,
-		      struct bch_inode_unpacked *,
-		      const struct qstr *, bool);
-
-int bch2_rename_trans(struct btree_trans *,
-		      subvol_inum, struct bch_inode_unpacked *,
-		      subvol_inum, struct bch_inode_unpacked *,
-		      struct bch_inode_unpacked *,
-		      struct bch_inode_unpacked *,
-		      const struct qstr *,
-		      const struct qstr *,
-		      enum bch_rename_mode);
-
-bool bch2_reinherit_attrs(struct bch_inode_unpacked *,
-			  struct bch_inode_unpacked *);
-
-#endif /* _BCACHEFS_FS_COMMON_H */
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
deleted file mode 100644
index b0a33fabadf8..000000000000
--- a/fs/bcachefs/fs-io-buffered.c
+++ /dev/null
@@ -1,1156 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "fs-io.h"
-#include "fs-io-buffered.h"
-#include "fs-io-direct.h"
-#include "fs-io-pagecache.h"
-#include "io_read.h"
-#include "io_write.h"
-
-#include <linux/backing-dev.h>
-#include <linux/pagemap.h>
-#include <linux/writeback.h>
-
-static inline bool bio_full(struct bio *bio, unsigned len)
-{
-	if (bio->bi_vcnt >= bio->bi_max_vecs)
-		return true;
-	if (bio->bi_iter.bi_size > UINT_MAX - len)
-		return true;
-	return false;
-}
-
-/* readpage(s): */
-
-static void bch2_readpages_end_io(struct bio *bio)
-{
-	struct folio_iter fi;
-
-	bio_for_each_folio_all(fi, bio)
-		folio_end_read(fi.folio, bio->bi_status == BLK_STS_OK);
-
-	bio_put(bio);
-}
-
-struct readpages_iter {
-	struct address_space	*mapping;
-	unsigned		idx;
-	folios			folios;
-};
-
-static int readpages_iter_init(struct readpages_iter *iter,
-			       struct readahead_control *ractl)
-{
-	struct folio *folio;
-
-	*iter = (struct readpages_iter) { ractl->mapping };
-
-	while ((folio = __readahead_folio(ractl))) {
-		if (!bch2_folio_create(folio, GFP_KERNEL) ||
-		    darray_push(&iter->folios, folio)) {
-			bch2_folio_release(folio);
-			ractl->_nr_pages += folio_nr_pages(folio);
-			ractl->_index -= folio_nr_pages(folio);
-			return iter->folios.nr ? 0 : -ENOMEM;
-		}
-
-		folio_put(folio);
-	}
-
-	return 0;
-}
-
-static inline struct folio *readpage_iter_peek(struct readpages_iter *iter)
-{
-	if (iter->idx >= iter->folios.nr)
-		return NULL;
-	return iter->folios.data[iter->idx];
-}
-
-static inline void readpage_iter_advance(struct readpages_iter *iter)
-{
-	iter->idx++;
-}
-
-static bool extent_partial_reads_expensive(struct bkey_s_c k)
-{
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	struct bch_extent_crc_unpacked crc;
-	const union bch_extent_entry *i;
-
-	bkey_for_each_crc(k.k, ptrs, crc, i)
-		if (crc.csum_type || crc.compression_type)
-			return true;
-	return false;
-}
-
-static int readpage_bio_extend(struct btree_trans *trans,
-			       struct readpages_iter *iter,
-			       struct bio *bio,
-			       unsigned sectors_this_extent,
-			       bool get_more)
-{
-	/* Don't hold btree locks while allocating memory: */
-	bch2_trans_unlock(trans);
-
-	while (bio_sectors(bio) < sectors_this_extent &&
-	       bio->bi_vcnt < bio->bi_max_vecs) {
-		struct folio *folio = readpage_iter_peek(iter);
-		int ret;
-
-		if (folio) {
-			readpage_iter_advance(iter);
-		} else {
-			pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT;
-
-			if (!get_more)
-				break;
-
-			folio = xa_load(&iter->mapping->i_pages, folio_offset);
-			if (folio && !xa_is_value(folio))
-				break;
-
-			folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0);
-			if (!folio)
-				break;
-
-			if (!__bch2_folio_create(folio, GFP_KERNEL)) {
-				folio_put(folio);
-				break;
-			}
-
-			ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL);
-			if (ret) {
-				__bch2_folio_release(folio);
-				folio_put(folio);
-				break;
-			}
-
-			folio_put(folio);
-		}
-
-		BUG_ON(folio_sector(folio) != bio_end_sector(bio));
-
-		BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0));
-	}
-
-	return bch2_trans_relock(trans);
-}
-
-static void bchfs_read(struct btree_trans *trans,
-		       struct bch_read_bio *rbio,
-		       subvol_inum inum,
-		       struct readpages_iter *readpages_iter)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter iter;
-	struct bkey_buf sk;
-	int flags = BCH_READ_RETRY_IF_STALE|
-		BCH_READ_MAY_PROMOTE;
-	u32 snapshot;
-	int ret = 0;
-
-	rbio->c = c;
-	rbio->start_time = local_clock();
-	rbio->subvol = inum.subvol;
-
-	bch2_bkey_buf_init(&sk);
-retry:
-	bch2_trans_begin(trans);
-	iter = (struct btree_iter) { NULL };
-
-	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-	if (ret)
-		goto err;
-
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
-			     SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
-			     BTREE_ITER_slots);
-	while (1) {
-		struct bkey_s_c k;
-		unsigned bytes, sectors, offset_into_extent;
-		enum btree_id data_btree = BTREE_ID_extents;
-
-		/*
-		 * read_extent -> io_time_reset may cause a transaction restart
-		 * without returning an error, we need to check for that here:
-		 */
-		ret = bch2_trans_relock(trans);
-		if (ret)
-			break;
-
-		bch2_btree_iter_set_pos(&iter,
-				POS(inum.inum, rbio->bio.bi_iter.bi_sector));
-
-		k = bch2_btree_iter_peek_slot(&iter);
-		ret = bkey_err(k);
-		if (ret)
-			break;
-
-		offset_into_extent = iter.pos.offset -
-			bkey_start_offset(k.k);
-		sectors = k.k->size - offset_into_extent;
-
-		bch2_bkey_buf_reassemble(&sk, c, k);
-
-		ret = bch2_read_indirect_extent(trans, &data_btree,
-					&offset_into_extent, &sk);
-		if (ret)
-			break;
-
-		k = bkey_i_to_s_c(sk.k);
-
-		sectors = min(sectors, k.k->size - offset_into_extent);
-
-		if (readpages_iter) {
-			ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
-						  extent_partial_reads_expensive(k));
-			if (ret)
-				break;
-		}
-
-		bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
-		swap(rbio->bio.bi_iter.bi_size, bytes);
-
-		if (rbio->bio.bi_iter.bi_size == bytes)
-			flags |= BCH_READ_LAST_FRAGMENT;
-
-		bch2_bio_page_state_set(&rbio->bio, k);
-
-		bch2_read_extent(trans, rbio, iter.pos,
-				 data_btree, k, offset_into_extent, flags);
-
-		if (flags & BCH_READ_LAST_FRAGMENT)
-			break;
-
-		swap(rbio->bio.bi_iter.bi_size, bytes);
-		bio_advance(&rbio->bio, bytes);
-
-		ret = btree_trans_too_many_iters(trans);
-		if (ret)
-			break;
-	}
-err:
-	bch2_trans_iter_exit(trans, &iter);
-
-	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-		goto retry;
-
-	if (ret) {
-		bch_err_inum_offset_ratelimited(c,
-				iter.pos.inode,
-				iter.pos.offset << 9,
-				"read error %i from btree lookup", ret);
-		rbio->bio.bi_status = BLK_STS_IOERR;
-		bio_endio(&rbio->bio);
-	}
-
-	bch2_bkey_buf_exit(&sk, c);
-}
-
-void bch2_readahead(struct readahead_control *ractl)
-{
-	struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct bch_io_opts opts;
-	struct btree_trans *trans = bch2_trans_get(c);
-	struct folio *folio;
-	struct readpages_iter readpages_iter;
-
-	bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
-	int ret = readpages_iter_init(&readpages_iter, ractl);
-	if (ret)
-		return;
-
-	bch2_pagecache_add_get(inode);
-
-	while ((folio = readpage_iter_peek(&readpages_iter))) {
-		unsigned n = min_t(unsigned,
-				   readpages_iter.folios.nr -
-				   readpages_iter.idx,
-				   BIO_MAX_VECS);
-		struct bch_read_bio *rbio =
-			rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
-						   GFP_KERNEL, &c->bio_read),
-				  opts);
-
-		readpage_iter_advance(&readpages_iter);
-
-		rbio->bio.bi_iter.bi_sector = folio_sector(folio);
-		rbio->bio.bi_end_io = bch2_readpages_end_io;
-		BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
-
-		bchfs_read(trans, rbio, inode_inum(inode),
-			   &readpages_iter);
-		bch2_trans_unlock(trans);
-	}
-
-	bch2_pagecache_add_put(inode);
-
-	bch2_trans_put(trans);
-	darray_exit(&readpages_iter.folios);
-}
-
-static void bch2_read_single_folio_end_io(struct bio *bio)
-{
-	complete(bio->bi_private);
-}
-
-int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
-{
-	struct bch_inode_info *inode = to_bch_ei(mapping->host);
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct bch_read_bio *rbio;
-	struct bch_io_opts opts;
-	int ret;
-	DECLARE_COMPLETION_ONSTACK(done);
-
-	if (!bch2_folio_create(folio, GFP_KERNEL))
-		return -ENOMEM;
-
-	bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
-	rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
-			 opts);
-	rbio->bio.bi_private = &done;
-	rbio->bio.bi_end_io = bch2_read_single_folio_end_io;
-
-	rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
-	rbio->bio.bi_iter.bi_sector = folio_sector(folio);
-	BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
-
-	bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0));
-	wait_for_completion(&done);
-
-	ret = blk_status_to_errno(rbio->bio.bi_status);
-	bio_put(&rbio->bio);
-
-	if (ret < 0)
-		return ret;
-
-	folio_mark_uptodate(folio);
-	return 0;
-}
-
-int bch2_read_folio(struct file *file, struct folio *folio)
-{
-	int ret;
-
-	ret = bch2_read_single_folio(folio, folio->mapping);
-	folio_unlock(folio);
-	return bch2_err_class(ret);
-}
-
-/* writepages: */
-
-struct bch_writepage_io {
-	struct bch_inode_info		*inode;
-
-	/* must be last: */
-	struct bch_write_op		op;
-};
-
-struct bch_writepage_state {
-	struct bch_writepage_io	*io;
-	struct bch_io_opts	opts;
-	struct bch_folio_sector	*tmp;
-	unsigned		tmp_sectors;
-};
-
-static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
-								  struct bch_inode_info *inode)
-{
-	struct bch_writepage_state ret = { 0 };
-
-	bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode);
-	return ret;
-}
-
-/*
- * Determine when a writepage io is full. We have to limit writepage bios to a
- * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to
- * what the bounce path in bch2_write_extent() can handle. In theory we could
- * loosen this restriction for non-bounce I/O, but we don't have that context
- * here. Ideally, we can up this limit and make it configurable in the future
- * when the bounce path can be enhanced to accommodate larger source bios.
- */
-static inline bool bch_io_full(struct bch_writepage_io *io, unsigned len)
-{
-	struct bio *bio = &io->op.wbio.bio;
-	return bio_full(bio, len) ||
-		(bio->bi_iter.bi_size + len > BIO_MAX_VECS * PAGE_SIZE);
-}
-
-static void bch2_writepage_io_done(struct bch_write_op *op)
-{
-	struct bch_writepage_io *io =
-		container_of(op, struct bch_writepage_io, op);
-	struct bch_fs *c = io->op.c;
-	struct bio *bio = &io->op.wbio.bio;
-	struct folio_iter fi;
-	unsigned i;
-
-	if (io->op.error) {
-		set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
-
-		bio_for_each_folio_all(fi, bio) {
-			struct bch_folio *s;
-
-			mapping_set_error(fi.folio->mapping, -EIO);
-
-			s = __bch2_folio(fi.folio);
-			spin_lock(&s->lock);
-			for (i = 0; i < folio_sectors(fi.folio); i++)
-				s->s[i].nr_replicas = 0;
-			spin_unlock(&s->lock);
-		}
-	}
-
-	if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
-		bio_for_each_folio_all(fi, bio) {
-			struct bch_folio *s;
-
-			s = __bch2_folio(fi.folio);
-			spin_lock(&s->lock);
-			for (i = 0; i < folio_sectors(fi.folio); i++)
-				s->s[i].nr_replicas = 0;
-			spin_unlock(&s->lock);
-		}
-	}
-
-	/*
-	 * racing with fallocate can cause us to add fewer sectors than
-	 * expected - but we shouldn't add more sectors than expected:
-	 */
-	WARN_ON_ONCE(io->op.i_sectors_delta > 0);
-
-	/*
-	 * (error (due to going RO) halfway through a page can screw that up
-	 * slightly)
-	 * XXX wtf?
-	   BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS);
-	 */
-
-	/*
-	 * PageWriteback is effectively our ref on the inode - fixup i_blocks
-	 * before calling end_page_writeback:
-	 */
-	bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
-
-	bio_for_each_folio_all(fi, bio) {
-		struct bch_folio *s = __bch2_folio(fi.folio);
-
-		if (atomic_dec_and_test(&s->write_count))
-			folio_end_writeback(fi.folio);
-	}
-
-	bio_put(&io->op.wbio.bio);
-}
-
-static void bch2_writepage_do_io(struct bch_writepage_state *w)
-{
-	struct bch_writepage_io *io = w->io;
-
-	w->io = NULL;
-	closure_call(&io->op.cl, bch2_write, NULL, NULL);
-}
-
-/*
- * Get a bch_writepage_io and add @page to it - appending to an existing one if
- * possible, else allocating a new one:
- */
-static void bch2_writepage_io_alloc(struct bch_fs *c,
-				    struct writeback_control *wbc,
-				    struct bch_writepage_state *w,
-				    struct bch_inode_info *inode,
-				    u64 sector,
-				    unsigned nr_replicas)
-{
-	struct bch_write_op *op;
-
-	w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
-					      REQ_OP_WRITE,
-					      GFP_KERNEL,
-					      &c->writepage_bioset),
-			     struct bch_writepage_io, op.wbio.bio);
-
-	w->io->inode		= inode;
-	op			= &w->io->op;
-	bch2_write_op_init(op, c, w->opts);
-	op->target		= w->opts.foreground_target;
-	op->nr_replicas		= nr_replicas;
-	op->res.nr_replicas	= nr_replicas;
-	op->write_point		= writepoint_hashed(inode->ei_last_dirtied);
-	op->subvol		= inode->ei_subvol;
-	op->pos			= POS(inode->v.i_ino, sector);
-	op->end_io		= bch2_writepage_io_done;
-	op->devs_need_flush	= &inode->ei_devs_need_flush;
-	op->wbio.bio.bi_iter.bi_sector = sector;
-	op->wbio.bio.bi_opf	= wbc_to_write_flags(wbc);
-}
-
-static int __bch2_writepage(struct folio *folio,
-			    struct writeback_control *wbc,
-			    void *data)
-{
-	struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct bch_writepage_state *w = data;
-	struct bch_folio *s;
-	unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX;
-	loff_t i_size = i_size_read(&inode->v);
-	int ret;
-
-	EBUG_ON(!folio_test_uptodate(folio));
-
-	/* Is the folio fully inside i_size? */
-	if (folio_end_pos(folio) <= i_size)
-		goto do_io;
-
-	/* Is the folio fully outside i_size? (truncate in progress) */
-	if (folio_pos(folio) >= i_size) {
-		folio_unlock(folio);
-		return 0;
-	}
-
-	/*
-	 * The folio straddles i_size.  It must be zeroed out on each and every
-	 * writepage invocation because it may be mmapped.  "A file is mapped
-	 * in multiples of the folio size.  For a file that is not a multiple of
-	 * the  folio size, the remaining memory is zeroed when mapped, and
-	 * writes to that region are not written out to the file."
-	 */
-	folio_zero_segment(folio,
-			   i_size - folio_pos(folio),
-			   folio_size(folio));
-do_io:
-	f_sectors = folio_sectors(folio);
-	s = bch2_folio(folio);
-
-	if (f_sectors > w->tmp_sectors) {
-		kfree(w->tmp);
-		w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), __GFP_NOFAIL);
-		w->tmp_sectors = f_sectors;
-	}
-
-	/*
-	 * Things get really hairy with errors during writeback:
-	 */
-	ret = bch2_get_folio_disk_reservation(c, inode, folio, false);
-	BUG_ON(ret);
-
-	/* Before unlocking the page, get copy of reservations: */
-	spin_lock(&s->lock);
-	memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors);
-
-	for (i = 0; i < f_sectors; i++) {
-		if (s->s[i].state < SECTOR_dirty)
-			continue;
-
-		nr_replicas_this_write =
-			min_t(unsigned, nr_replicas_this_write,
-			      s->s[i].nr_replicas +
-			      s->s[i].replicas_reserved);
-	}
-
-	for (i = 0; i < f_sectors; i++) {
-		if (s->s[i].state < SECTOR_dirty)
-			continue;
-
-		s->s[i].nr_replicas = w->opts.compression
-			? 0 : nr_replicas_this_write;
-
-		s->s[i].replicas_reserved = 0;
-		bch2_folio_sector_set(folio, s, i, SECTOR_allocated);
-	}
-	spin_unlock(&s->lock);
-
-	BUG_ON(atomic_read(&s->write_count));
-	atomic_set(&s->write_count, 1);
-
-	BUG_ON(folio_test_writeback(folio));
-	folio_start_writeback(folio);
-
-	folio_unlock(folio);
-
-	offset = 0;
-	while (1) {
-		unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0;
-		u64 sector;
-
-		while (offset < f_sectors &&
-		       w->tmp[offset].state < SECTOR_dirty)
-			offset++;
-
-		if (offset == f_sectors)
-			break;
-
-		while (offset + sectors < f_sectors &&
-		       w->tmp[offset + sectors].state >= SECTOR_dirty) {
-			reserved_sectors += w->tmp[offset + sectors].replicas_reserved;
-			dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty;
-			sectors++;
-		}
-		BUG_ON(!sectors);
-
-		sector = folio_sector(folio) + offset;
-
-		if (w->io &&
-		    (w->io->op.res.nr_replicas != nr_replicas_this_write ||
-		     bch_io_full(w->io, sectors << 9) ||
-		     bio_end_sector(&w->io->op.wbio.bio) != sector))
-			bch2_writepage_do_io(w);
-
-		if (!w->io)
-			bch2_writepage_io_alloc(c, wbc, w, inode, sector,
-						nr_replicas_this_write);
-
-		atomic_inc(&s->write_count);
-
-		BUG_ON(inode != w->io->inode);
-		BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio,
-				     sectors << 9, offset << 9));
-
-		/* Check for writing past i_size: */
-		WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
-			  round_up(i_size, block_bytes(c)) &&
-			  !test_bit(BCH_FS_emergency_ro, &c->flags),
-			  "writing past i_size: %llu > %llu (unrounded %llu)\n",
-			  bio_end_sector(&w->io->op.wbio.bio) << 9,
-			  round_up(i_size, block_bytes(c)),
-			  i_size);
-
-		w->io->op.res.sectors += reserved_sectors;
-		w->io->op.i_sectors_delta -= dirty_sectors;
-		w->io->op.new_i_size = i_size;
-
-		offset += sectors;
-	}
-
-	if (atomic_dec_and_test(&s->write_count))
-		folio_end_writeback(folio);
-
-	return 0;
-}
-
-int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
-{
-	struct bch_fs *c = mapping->host->i_sb->s_fs_info;
-	struct bch_writepage_state w =
-		bch_writepage_state_init(c, to_bch_ei(mapping->host));
-	struct blk_plug plug;
-	int ret;
-
-	blk_start_plug(&plug);
-	ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
-	if (w.io)
-		bch2_writepage_do_io(&w);
-	blk_finish_plug(&plug);
-	kfree(w.tmp);
-	return bch2_err_class(ret);
-}
-
-/* buffered writes: */
-
-int bch2_write_begin(struct file *file, struct address_space *mapping,
-		     loff_t pos, unsigned len,
-		     struct page **pagep, void **fsdata)
-{
-	struct bch_inode_info *inode = to_bch_ei(mapping->host);
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct bch2_folio_reservation *res;
-	struct folio *folio;
-	unsigned offset;
-	int ret = -ENOMEM;
-
-	res = kmalloc(sizeof(*res), GFP_KERNEL);
-	if (!res)
-		return -ENOMEM;
-
-	bch2_folio_reservation_init(c, inode, res);
-	*fsdata = res;
-
-	bch2_pagecache_add_get(inode);
-
-	folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
-				FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE,
-				mapping_gfp_mask(mapping));
-	if (IS_ERR_OR_NULL(folio))
-		goto err_unlock;
-
-	offset = pos - folio_pos(folio);
-	len = min_t(size_t, len, folio_end_pos(folio) - pos);
-
-	if (folio_test_uptodate(folio))
-		goto out;
-
-	/* If we're writing entire folio, don't need to read it in first: */
-	if (!offset && len == folio_size(folio))
-		goto out;
-
-	if (!offset && pos + len >= inode->v.i_size) {
-		folio_zero_segment(folio, len, folio_size(folio));
-		flush_dcache_folio(folio);
-		goto out;
-	}
-
-	if (folio_pos(folio) >= inode->v.i_size) {
-		folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio));
-		flush_dcache_folio(folio);
-		goto out;
-	}
-readpage:
-	ret = bch2_read_single_folio(folio, mapping);
-	if (ret)
-		goto err;
-out:
-	ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
-	if (ret)
-		goto err;
-
-	ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len);
-	if (ret) {
-		if (!folio_test_uptodate(folio)) {
-			/*
-			 * If the folio hasn't been read in, we won't know if we
-			 * actually need a reservation - we don't actually need
-			 * to read here, we just need to check if the folio is
-			 * fully backed by uncompressed data:
-			 */
-			goto readpage;
-		}
-
-		goto err;
-	}
-
-	*pagep = &folio->page;
-	return 0;
-err:
-	folio_unlock(folio);
-	folio_put(folio);
-	*pagep = NULL;
-err_unlock:
-	bch2_pagecache_add_put(inode);
-	kfree(res);
-	*fsdata = NULL;
-	return bch2_err_class(ret);
-}
-
-int bch2_write_end(struct file *file, struct address_space *mapping,
-		   loff_t pos, unsigned len, unsigned copied,
-		   struct page *page, void *fsdata)
-{
-	struct bch_inode_info *inode = to_bch_ei(mapping->host);
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct bch2_folio_reservation *res = fsdata;
-	struct folio *folio = page_folio(page);
-	unsigned offset = pos - folio_pos(folio);
-
-	lockdep_assert_held(&inode->v.i_rwsem);
-	BUG_ON(offset + copied > folio_size(folio));
-
-	if (unlikely(copied < len && !folio_test_uptodate(folio))) {
-		/*
-		 * The folio needs to be read in, but that would destroy
-		 * our partial write - simplest thing is to just force
-		 * userspace to redo the write:
-		 */
-		folio_zero_range(folio, 0, folio_size(folio));
-		flush_dcache_folio(folio);
-		copied = 0;
-	}
-
-	spin_lock(&inode->v.i_lock);
-	if (pos + copied > inode->v.i_size)
-		i_size_write(&inode->v, pos + copied);
-	spin_unlock(&inode->v.i_lock);
-
-	if (copied) {
-		if (!folio_test_uptodate(folio))
-			folio_mark_uptodate(folio);
-
-		bch2_set_folio_dirty(c, inode, folio, res, offset, copied);
-
-		inode->ei_last_dirtied = (unsigned long) current;
-	}
-
-	folio_unlock(folio);
-	folio_put(folio);
-	bch2_pagecache_add_put(inode);
-
-	bch2_folio_reservation_put(c, inode, res);
-	kfree(res);
-
-	return copied;
-}
-
-static noinline void folios_trunc(folios *fs, struct folio **fi)
-{
-	while (fs->data + fs->nr > fi) {
-		struct folio *f = darray_pop(fs);
-
-		folio_unlock(f);
-		folio_put(f);
-	}
-}
-
-static int __bch2_buffered_write(struct bch_inode_info *inode,
-				 struct address_space *mapping,
-				 struct iov_iter *iter,
-				 loff_t pos, unsigned len,
-				 bool inode_locked)
-{
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct bch2_folio_reservation res;
-	folios fs;
-	struct folio *f;
-	unsigned copied = 0, f_offset, f_copied;
-	u64 end = pos + len, f_pos, f_len;
-	loff_t last_folio_pos = inode->v.i_size;
-	int ret = 0;
-
-	BUG_ON(!len);
-
-	bch2_folio_reservation_init(c, inode, &res);
-	darray_init(&fs);
-
-	ret = bch2_filemap_get_contig_folios_d(mapping, pos, end,
-				   FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT,
-				   mapping_gfp_mask(mapping),
-				   &fs);
-	if (ret)
-		goto out;
-
-	BUG_ON(!fs.nr);
-
-	/*
-	 * If we're not using the inode lock, we need to lock all the folios for
-	 * atomiticity of writes vs. other writes:
-	 */
-	if (!inode_locked && folio_end_pos(darray_last(fs)) < end) {
-		ret = -BCH_ERR_need_inode_lock;
-		goto out;
-	}
-
-	f = darray_first(fs);
-	if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
-		ret = bch2_read_single_folio(f, mapping);
-		if (ret)
-			goto out;
-	}
-
-	f = darray_last(fs);
-	end = min(end, folio_end_pos(f));
-	last_folio_pos = folio_pos(f);
-	if (end != folio_end_pos(f) && !folio_test_uptodate(f)) {
-		if (end >= inode->v.i_size) {
-			folio_zero_range(f, 0, folio_size(f));
-		} else {
-			ret = bch2_read_single_folio(f, mapping);
-			if (ret)
-				goto out;
-		}
-	}
-
-	ret = bch2_folio_set(c, inode_inum(inode), fs.data, fs.nr);
-	if (ret)
-		goto out;
-
-	f_pos = pos;
-	f_offset = pos - folio_pos(darray_first(fs));
-	darray_for_each(fs, fi) {
-		f = *fi;
-		f_len = min(end, folio_end_pos(f)) - f_pos;
-
-		/*
-		 * XXX: per POSIX and fstests generic/275, on -ENOSPC we're
-		 * supposed to write as much as we have disk space for.
-		 *
-		 * On failure here we should still write out a partial page if
-		 * we aren't completely out of disk space - we don't do that
-		 * yet:
-		 */
-		ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len);
-		if (unlikely(ret)) {
-			folios_trunc(&fs, fi);
-			if (!fs.nr)
-				goto out;
-
-			end = min(end, folio_end_pos(darray_last(fs)));
-			break;
-		}
-
-		f_pos = folio_end_pos(f);
-		f_offset = 0;
-	}
-
-	if (mapping_writably_mapped(mapping))
-		darray_for_each(fs, fi)
-			flush_dcache_folio(*fi);
-
-	f_pos = pos;
-	f_offset = pos - folio_pos(darray_first(fs));
-	darray_for_each(fs, fi) {
-		f = *fi;
-		f_len = min(end, folio_end_pos(f)) - f_pos;
-		f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter);
-		if (!f_copied) {
-			folios_trunc(&fs, fi);
-			break;
-		}
-
-		if (!folio_test_uptodate(f) &&
-		    f_copied != folio_size(f) &&
-		    pos + copied + f_copied < inode->v.i_size) {
-			iov_iter_revert(iter, f_copied);
-			folio_zero_range(f, 0, folio_size(f));
-			folios_trunc(&fs, fi);
-			break;
-		}
-
-		flush_dcache_folio(f);
-		copied += f_copied;
-
-		if (f_copied != f_len) {
-			folios_trunc(&fs, fi + 1);
-			break;
-		}
-
-		f_pos = folio_end_pos(f);
-		f_offset = 0;
-	}
-
-	if (!copied)
-		goto out;
-
-	end = pos + copied;
-
-	spin_lock(&inode->v.i_lock);
-	if (end > inode->v.i_size) {
-		BUG_ON(!inode_locked);
-		i_size_write(&inode->v, end);
-	}
-	spin_unlock(&inode->v.i_lock);
-
-	f_pos = pos;
-	f_offset = pos - folio_pos(darray_first(fs));
-	darray_for_each(fs, fi) {
-		f = *fi;
-		f_len = min(end, folio_end_pos(f)) - f_pos;
-
-		if (!folio_test_uptodate(f))
-			folio_mark_uptodate(f);
-
-		bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len);
-
-		f_pos = folio_end_pos(f);
-		f_offset = 0;
-	}
-
-	inode->ei_last_dirtied = (unsigned long) current;
-out:
-	darray_for_each(fs, fi) {
-		folio_unlock(*fi);
-		folio_put(*fi);
-	}
-
-	/*
-	 * If the last folio added to the mapping starts beyond current EOF, we
-	 * performed a short write but left around at least one post-EOF folio.
-	 * Clean up the mapping before we return.
-	 */
-	if (last_folio_pos >= inode->v.i_size)
-		truncate_pagecache(&inode->v, inode->v.i_size);
-
-	darray_exit(&fs);
-	bch2_folio_reservation_put(c, inode, &res);
-
-	return copied ?: ret;
-}
-
-static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
-{
-	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	struct bch_inode_info *inode = file_bch_inode(file);
-	loff_t pos;
-	bool inode_locked = false;
-	ssize_t written = 0, written2 = 0, ret = 0;
-
-	/*
-	 * We don't take the inode lock unless i_size will be changing. Folio
-	 * locks provide exclusion with other writes, and the pagecache add lock
-	 * provides exclusion with truncate and hole punching.
-	 *
-	 * There is one nasty corner case where atomicity would be broken
-	 * without great care: when copying data from userspace to the page
-	 * cache, we do that with faults disable - a page fault would recurse
-	 * back into the filesystem, taking filesystem locks again, and
-	 * deadlock; so it's done with faults disabled, and we fault in the user
-	 * buffer when we aren't holding locks.
-	 *
-	 * If we do part of the write, but we then race and in the userspace
-	 * buffer have been evicted and are no longer resident, then we have to
-	 * drop our folio locks to re-fault them in, breaking write atomicity.
-	 *
-	 * To fix this, we restart the write from the start, if we weren't
-	 * holding the inode lock.
-	 *
-	 * There is another wrinkle after that; if we restart the write from the
-	 * start, and then get an unrecoverable error, we _cannot_ claim to
-	 * userspace that we did not write data we actually did - so we must
-	 * track (written2) the most we ever wrote.
-	 */
-
-	if ((iocb->ki_flags & IOCB_APPEND) ||
-	    (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) {
-		inode_lock(&inode->v);
-		inode_locked = true;
-	}
-
-	ret = generic_write_checks(iocb, iter);
-	if (ret <= 0)
-		goto unlock;
-
-	ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0);
-	if (ret) {
-		if (!inode_locked) {
-			inode_lock(&inode->v);
-			inode_locked = true;
-			ret = file_remove_privs_flags(file, 0);
-		}
-		if (ret)
-			goto unlock;
-	}
-
-	ret = file_update_time(file);
-	if (ret)
-		goto unlock;
-
-	pos = iocb->ki_pos;
-
-	bch2_pagecache_add_get(inode);
-
-	if (!inode_locked &&
-	    (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v)))
-		goto get_inode_lock;
-
-	do {
-		unsigned offset = pos & (PAGE_SIZE - 1);
-		unsigned bytes = iov_iter_count(iter);
-again:
-		/*
-		 * Bring in the user page that we will copy from _first_.
-		 * Otherwise there's a nasty deadlock on copying from the
-		 * same page as we're writing to, without it being marked
-		 * up-to-date.
-		 *
-		 * Not only is this an optimisation, but it is also required
-		 * to check that the address is actually valid, when atomic
-		 * usercopies are used, below.
-		 */
-		if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
-			bytes = min_t(unsigned long, iov_iter_count(iter),
-				      PAGE_SIZE - offset);
-
-			if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
-				ret = -EFAULT;
-				break;
-			}
-		}
-
-		if (unlikely(bytes != iov_iter_count(iter) && !inode_locked))
-			goto get_inode_lock;
-
-		if (unlikely(fatal_signal_pending(current))) {
-			ret = -EINTR;
-			break;
-		}
-
-		ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked);
-		if (ret == -BCH_ERR_need_inode_lock)
-			goto get_inode_lock;
-		if (unlikely(ret < 0))
-			break;
-
-		cond_resched();
-
-		if (unlikely(ret == 0)) {
-			/*
-			 * If we were unable to copy any data at all, we must
-			 * fall back to a single segment length write.
-			 *
-			 * If we didn't fallback here, we could livelock
-			 * because not all segments in the iov can be copied at
-			 * once without a pagefault.
-			 */
-			bytes = min_t(unsigned long, PAGE_SIZE - offset,
-				      iov_iter_single_seg_count(iter));
-			goto again;
-		}
-		pos += ret;
-		written += ret;
-		written2 = max(written, written2);
-
-		if (ret != bytes && !inode_locked)
-			goto get_inode_lock;
-		ret = 0;
-
-		balance_dirty_pages_ratelimited(mapping);
-
-		if (0) {
-get_inode_lock:
-			bch2_pagecache_add_put(inode);
-			inode_lock(&inode->v);
-			inode_locked = true;
-			bch2_pagecache_add_get(inode);
-
-			iov_iter_revert(iter, written);
-			pos -= written;
-			written = 0;
-			ret = 0;
-		}
-	} while (iov_iter_count(iter));
-	bch2_pagecache_add_put(inode);
-unlock:
-	if (inode_locked)
-		inode_unlock(&inode->v);
-
-	iocb->ki_pos += written;
-
-	ret = max(written, written2) ?: ret;
-	if (ret > 0)
-		ret = generic_write_sync(iocb, ret);
-	return ret;
-}
-
-ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter)
-{
-	ssize_t ret = iocb->ki_flags & IOCB_DIRECT
-		? bch2_direct_write(iocb, iter)
-		: bch2_buffered_write(iocb, iter);
-
-	return bch2_err_class(ret);
-}
-
-void bch2_fs_fs_io_buffered_exit(struct bch_fs *c)
-{
-	bioset_exit(&c->writepage_bioset);
-}
-
-int bch2_fs_fs_io_buffered_init(struct bch_fs *c)
-{
-	if (bioset_init(&c->writepage_bioset,
-			4, offsetof(struct bch_writepage_io, op.wbio.bio),
-			BIOSET_NEED_BVECS))
-		return -BCH_ERR_ENOMEM_writepage_bioset_init;
-
-	return 0;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h
deleted file mode 100644
index a6126ff790e6..000000000000
--- a/fs/bcachefs/fs-io-buffered.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_IO_BUFFERED_H
-#define _BCACHEFS_FS_IO_BUFFERED_H
-
-#ifndef NO_BCACHEFS_FS
-
-int bch2_read_single_folio(struct folio *, struct address_space *);
-int bch2_read_folio(struct file *, struct folio *);
-
-int bch2_writepages(struct address_space *, struct writeback_control *);
-void bch2_readahead(struct readahead_control *);
-
-int bch2_write_begin(struct file *, struct address_space *, loff_t,
-		     unsigned, struct page **, void **);
-int bch2_write_end(struct file *, struct address_space *, loff_t,
-		   unsigned, unsigned, struct page *, void *);
-
-ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
-
-void bch2_fs_fs_io_buffered_exit(struct bch_fs *);
-int bch2_fs_fs_io_buffered_init(struct bch_fs *);
-#else
-static inline void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) {}
-static inline int bch2_fs_fs_io_buffered_init(struct bch_fs *c) { return 0; }
-#endif
-
-#endif /* _BCACHEFS_FS_IO_BUFFERED_H */
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
deleted file mode 100644
index 09d21aef879a..000000000000
--- a/fs/bcachefs/fs-io-direct.c
+++ /dev/null
@@ -1,687 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "fs.h"
-#include "fs-io.h"
-#include "fs-io-direct.h"
-#include "fs-io-pagecache.h"
-#include "io_read.h"
-#include "io_write.h"
-
-#include <linux/kthread.h>
-#include <linux/pagemap.h>
-#include <linux/prefetch.h>
-#include <linux/task_io_accounting_ops.h>
-
-/* O_DIRECT reads */
-
-struct dio_read {
-	struct closure			cl;
-	struct kiocb			*req;
-	long				ret;
-	bool				should_dirty;
-	struct bch_read_bio		rbio;
-};
-
-static void bio_check_or_release(struct bio *bio, bool check_dirty)
-{
-	if (check_dirty) {
-		bio_check_pages_dirty(bio);
-	} else {
-		bio_release_pages(bio, false);
-		bio_put(bio);
-	}
-}
-
-static CLOSURE_CALLBACK(bch2_dio_read_complete)
-{
-	closure_type(dio, struct dio_read, cl);
-
-	dio->req->ki_complete(dio->req, dio->ret);
-	bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
-}
-
-static void bch2_direct_IO_read_endio(struct bio *bio)
-{
-	struct dio_read *dio = bio->bi_private;
-
-	if (bio->bi_status)
-		dio->ret = blk_status_to_errno(bio->bi_status);
-
-	closure_put(&dio->cl);
-}
-
-static void bch2_direct_IO_read_split_endio(struct bio *bio)
-{
-	struct dio_read *dio = bio->bi_private;
-	bool should_dirty = dio->should_dirty;
-
-	bch2_direct_IO_read_endio(bio);
-	bio_check_or_release(bio, should_dirty);
-}
-
-static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
-{
-	struct file *file = req->ki_filp;
-	struct bch_inode_info *inode = file_bch_inode(file);
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct bch_io_opts opts;
-	struct dio_read *dio;
-	struct bio *bio;
-	loff_t offset = req->ki_pos;
-	bool sync = is_sync_kiocb(req);
-	size_t shorten;
-	ssize_t ret;
-
-	bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
-	/* bios must be 512 byte aligned: */
-	if ((offset|iter->count) & (SECTOR_SIZE - 1))
-		return -EINVAL;
-
-	ret = min_t(loff_t, iter->count,
-		    max_t(loff_t, 0, i_size_read(&inode->v) - offset));
-
-	if (!ret)
-		return ret;
-
-	shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
-	if (shorten >= iter->count)
-		shorten = 0;
-	iter->count -= shorten;
-
-	bio = bio_alloc_bioset(NULL,
-			       bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
-			       REQ_OP_READ,
-			       GFP_KERNEL,
-			       &c->dio_read_bioset);
-
-	bio->bi_end_io = bch2_direct_IO_read_endio;
-
-	dio = container_of(bio, struct dio_read, rbio.bio);
-	closure_init(&dio->cl, NULL);
-
-	/*
-	 * this is a _really_ horrible hack just to avoid an atomic sub at the
-	 * end:
-	 */
-	if (!sync) {
-		set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
-		atomic_set(&dio->cl.remaining,
-			   CLOSURE_REMAINING_INITIALIZER -
-			   CLOSURE_RUNNING +
-			   CLOSURE_DESTRUCTOR);
-	} else {
-		atomic_set(&dio->cl.remaining,
-			   CLOSURE_REMAINING_INITIALIZER + 1);
-		dio->cl.closure_get_happened = true;
-	}
-
-	dio->req	= req;
-	dio->ret	= ret;
-	/*
-	 * This is one of the sketchier things I've encountered: we have to skip
-	 * the dirtying of requests that are internal from the kernel (i.e. from
-	 * loopback), because we'll deadlock on page_lock.
-	 */
-	dio->should_dirty = iter_is_iovec(iter);
-
-	goto start;
-	while (iter->count) {
-		bio = bio_alloc_bioset(NULL,
-				       bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
-				       REQ_OP_READ,
-				       GFP_KERNEL,
-				       &c->bio_read);
-		bio->bi_end_io		= bch2_direct_IO_read_split_endio;
-start:
-		bio->bi_opf		= REQ_OP_READ|REQ_SYNC;
-		bio->bi_iter.bi_sector	= offset >> 9;
-		bio->bi_private		= dio;
-
-		ret = bio_iov_iter_get_pages(bio, iter);
-		if (ret < 0) {
-			/* XXX: fault inject this path */
-			bio->bi_status = BLK_STS_RESOURCE;
-			bio_endio(bio);
-			break;
-		}
-
-		offset += bio->bi_iter.bi_size;
-
-		if (dio->should_dirty)
-			bio_set_pages_dirty(bio);
-
-		if (iter->count)
-			closure_get(&dio->cl);
-
-		bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
-	}
-
-	iter->count += shorten;
-
-	if (sync) {
-		closure_sync(&dio->cl);
-		closure_debug_destroy(&dio->cl);
-		ret = dio->ret;
-		bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
-		return ret;
-	} else {
-		return -EIOCBQUEUED;
-	}
-}
-
-ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
-{
-	struct file *file = iocb->ki_filp;
-	struct bch_inode_info *inode = file_bch_inode(file);
-	struct address_space *mapping = file->f_mapping;
-	size_t count = iov_iter_count(iter);
-	ssize_t ret;
-
-	if (!count)
-		return 0; /* skip atime */
-
-	if (iocb->ki_flags & IOCB_DIRECT) {
-		struct blk_plug plug;
-
-		if (unlikely(mapping->nrpages)) {
-			ret = filemap_write_and_wait_range(mapping,
-						iocb->ki_pos,
-						iocb->ki_pos + count - 1);
-			if (ret < 0)
-				goto out;
-		}
-
-		file_accessed(file);
-
-		blk_start_plug(&plug);
-		ret = bch2_direct_IO_read(iocb, iter);
-		blk_finish_plug(&plug);
-
-		if (ret >= 0)
-			iocb->ki_pos += ret;
-	} else {
-		bch2_pagecache_add_get(inode);
-		ret = generic_file_read_iter(iocb, iter);
-		bch2_pagecache_add_put(inode);
-	}
-out:
-	return bch2_err_class(ret);
-}
-
-/* O_DIRECT writes */
-
-struct dio_write {
-	struct kiocb			*req;
-	struct address_space		*mapping;
-	struct bch_inode_info		*inode;
-	struct mm_struct		*mm;
-	const struct iovec		*iov;
-	unsigned			loop:1,
-					extending:1,
-					sync:1,
-					flush:1;
-	struct quota_res		quota_res;
-	u64				written;
-
-	struct iov_iter			iter;
-	struct iovec			inline_vecs[2];
-
-	/* must be last: */
-	struct bch_write_op		op;
-};
-
-static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
-				       u64 offset, u64 size,
-				       unsigned nr_replicas, bool compressed)
-{
-	struct btree_trans *trans = bch2_trans_get(c);
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	u64 end = offset + size;
-	u32 snapshot;
-	bool ret = true;
-	int err;
-retry:
-	bch2_trans_begin(trans);
-
-	err = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-	if (err)
-		goto err;
-
-	for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
-			   SPOS(inum.inum, offset, snapshot),
-			   BTREE_ITER_slots, k, err) {
-		if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
-			break;
-
-		if (k.k->p.snapshot != snapshot ||
-		    nr_replicas > bch2_bkey_replicas(c, k) ||
-		    (!compressed && bch2_bkey_sectors_compressed(k))) {
-			ret = false;
-			break;
-		}
-	}
-
-	offset = iter.pos.offset;
-	bch2_trans_iter_exit(trans, &iter);
-err:
-	if (bch2_err_matches(err, BCH_ERR_transaction_restart))
-		goto retry;
-	bch2_trans_put(trans);
-
-	return err ? false : ret;
-}
-
-static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio)
-{
-	struct bch_fs *c = dio->op.c;
-	struct bch_inode_info *inode = dio->inode;
-	struct bio *bio = &dio->op.wbio.bio;
-
-	return bch2_check_range_allocated(c, inode_inum(inode),
-				dio->op.pos.offset, bio_sectors(bio),
-				dio->op.opts.data_replicas,
-				dio->op.opts.compression != 0);
-}
-
-static void bch2_dio_write_loop_async(struct bch_write_op *);
-static __always_inline long bch2_dio_write_done(struct dio_write *dio);
-
-/*
- * We're going to return -EIOCBQUEUED, but we haven't finished consuming the
- * iov_iter yet, so we need to stash a copy of the iovec: it might be on the
- * caller's stack, we're not guaranteed that it will live for the duration of
- * the IO:
- */
-static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
-{
-	struct iovec *iov = dio->inline_vecs;
-
-	/*
-	 * iov_iter has a single embedded iovec - nothing to do:
-	 */
-	if (iter_is_ubuf(&dio->iter))
-		return 0;
-
-	/*
-	 * We don't currently handle non-iovec iov_iters here - return an error,
-	 * and we'll fall back to doing the IO synchronously:
-	 */
-	if (!iter_is_iovec(&dio->iter))
-		return -1;
-
-	if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
-		dio->iov = iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
-				    GFP_KERNEL);
-		if (unlikely(!iov))
-			return -ENOMEM;
-	}
-
-	memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov));
-	dio->iter.__iov = iov;
-	return 0;
-}
-
-static CLOSURE_CALLBACK(bch2_dio_write_flush_done)
-{
-	closure_type(dio, struct dio_write, op.cl);
-	struct bch_fs *c = dio->op.c;
-
-	closure_debug_destroy(cl);
-
-	dio->op.error = bch2_journal_error(&c->journal);
-
-	bch2_dio_write_done(dio);
-}
-
-static noinline void bch2_dio_write_flush(struct dio_write *dio)
-{
-	struct bch_fs *c = dio->op.c;
-	struct bch_inode_unpacked inode;
-	int ret;
-
-	dio->flush = 0;
-
-	closure_init(&dio->op.cl, NULL);
-
-	if (!dio->op.error) {
-		ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode);
-		if (ret) {
-			dio->op.error = ret;
-		} else {
-			bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq,
-						     &dio->op.cl);
-			bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl);
-		}
-	}
-
-	if (dio->sync) {
-		closure_sync(&dio->op.cl);
-		closure_debug_destroy(&dio->op.cl);
-	} else {
-		continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL);
-	}
-}
-
-static __always_inline long bch2_dio_write_done(struct dio_write *dio)
-{
-	struct kiocb *req = dio->req;
-	struct bch_inode_info *inode = dio->inode;
-	bool sync = dio->sync;
-	long ret;
-
-	if (unlikely(dio->flush)) {
-		bch2_dio_write_flush(dio);
-		if (!sync)
-			return -EIOCBQUEUED;
-	}
-
-	bch2_pagecache_block_put(inode);
-
-	kfree(dio->iov);
-
-	ret = dio->op.error ?: ((long) dio->written << 9);
-	bio_put(&dio->op.wbio.bio);
-
-	bch2_write_ref_put(dio->op.c, BCH_WRITE_REF_dio_write);
-
-	/* inode->i_dio_count is our ref on inode and thus bch_fs */
-	inode_dio_end(&inode->v);
-
-	if (ret < 0)
-		ret = bch2_err_class(ret);
-
-	if (!sync) {
-		req->ki_complete(req, ret);
-		ret = -EIOCBQUEUED;
-	}
-	return ret;
-}
-
-static __always_inline void bch2_dio_write_end(struct dio_write *dio)
-{
-	struct bch_fs *c = dio->op.c;
-	struct kiocb *req = dio->req;
-	struct bch_inode_info *inode = dio->inode;
-	struct bio *bio = &dio->op.wbio.bio;
-
-	req->ki_pos	+= (u64) dio->op.written << 9;
-	dio->written	+= dio->op.written;
-
-	if (dio->extending) {
-		spin_lock(&inode->v.i_lock);
-		if (req->ki_pos > inode->v.i_size)
-			i_size_write(&inode->v, req->ki_pos);
-		spin_unlock(&inode->v.i_lock);
-	}
-
-	if (dio->op.i_sectors_delta || dio->quota_res.sectors) {
-		mutex_lock(&inode->ei_quota_lock);
-		__bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta);
-		__bch2_quota_reservation_put(c, inode, &dio->quota_res);
-		mutex_unlock(&inode->ei_quota_lock);
-	}
-
-	bio_release_pages(bio, false);
-
-	if (unlikely(dio->op.error))
-		set_bit(EI_INODE_ERROR, &inode->ei_flags);
-}
-
-static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
-{
-	struct bch_fs *c = dio->op.c;
-	struct kiocb *req = dio->req;
-	struct address_space *mapping = dio->mapping;
-	struct bch_inode_info *inode = dio->inode;
-	struct bch_io_opts opts;
-	struct bio *bio = &dio->op.wbio.bio;
-	unsigned unaligned, iter_count;
-	bool sync = dio->sync, dropped_locks;
-	long ret;
-
-	bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
-	while (1) {
-		iter_count = dio->iter.count;
-
-		EBUG_ON(current->faults_disabled_mapping);
-		current->faults_disabled_mapping = mapping;
-
-		ret = bio_iov_iter_get_pages(bio, &dio->iter);
-
-		dropped_locks = fdm_dropped_locks();
-
-		current->faults_disabled_mapping = NULL;
-
-		/*
-		 * If the fault handler returned an error but also signalled
-		 * that it dropped & retook ei_pagecache_lock, we just need to
-		 * re-shoot down the page cache and retry:
-		 */
-		if (dropped_locks && ret)
-			ret = 0;
-
-		if (unlikely(ret < 0))
-			goto err;
-
-		if (unlikely(dropped_locks)) {
-			ret = bch2_write_invalidate_inode_pages_range(mapping,
-					req->ki_pos,
-					req->ki_pos + iter_count - 1);
-			if (unlikely(ret))
-				goto err;
-
-			if (!bio->bi_iter.bi_size)
-				continue;
-		}
-
-		unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
-		bio->bi_iter.bi_size -= unaligned;
-		iov_iter_revert(&dio->iter, unaligned);
-
-		if (!bio->bi_iter.bi_size) {
-			/*
-			 * bio_iov_iter_get_pages was only able to get <
-			 * blocksize worth of pages:
-			 */
-			ret = -EFAULT;
-			goto err;
-		}
-
-		bch2_write_op_init(&dio->op, c, opts);
-		dio->op.end_io		= sync
-			? NULL
-			: bch2_dio_write_loop_async;
-		dio->op.target		= dio->op.opts.foreground_target;
-		dio->op.write_point	= writepoint_hashed((unsigned long) current);
-		dio->op.nr_replicas	= dio->op.opts.data_replicas;
-		dio->op.subvol		= inode->ei_subvol;
-		dio->op.pos		= POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
-		dio->op.devs_need_flush	= &inode->ei_devs_need_flush;
-
-		if (sync)
-			dio->op.flags |= BCH_WRITE_SYNC;
-		dio->op.flags |= BCH_WRITE_CHECK_ENOSPC;
-
-		ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
-						 bio_sectors(bio), true);
-		if (unlikely(ret))
-			goto err;
-
-		ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
-						dio->op.opts.data_replicas, 0);
-		if (unlikely(ret) &&
-		    !bch2_dio_write_check_allocated(dio))
-			goto err;
-
-		task_io_account_write(bio->bi_iter.bi_size);
-
-		if (unlikely(dio->iter.count) &&
-		    !dio->sync &&
-		    !dio->loop &&
-		    bch2_dio_write_copy_iov(dio))
-			dio->sync = sync = true;
-
-		dio->loop = true;
-		closure_call(&dio->op.cl, bch2_write, NULL, NULL);
-
-		if (!sync)
-			return -EIOCBQUEUED;
-
-		bch2_dio_write_end(dio);
-
-		if (likely(!dio->iter.count) || dio->op.error)
-			break;
-
-		bio_reset(bio, NULL, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
-	}
-out:
-	return bch2_dio_write_done(dio);
-err:
-	dio->op.error = ret;
-
-	bio_release_pages(bio, false);
-
-	bch2_quota_reservation_put(c, inode, &dio->quota_res);
-	goto out;
-}
-
-static noinline __cold void bch2_dio_write_continue(struct dio_write *dio)
-{
-	struct mm_struct *mm = dio->mm;
-
-	bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE);
-
-	if (mm)
-		kthread_use_mm(mm);
-	bch2_dio_write_loop(dio);
-	if (mm)
-		kthread_unuse_mm(mm);
-}
-
-static void bch2_dio_write_loop_async(struct bch_write_op *op)
-{
-	struct dio_write *dio = container_of(op, struct dio_write, op);
-
-	bch2_dio_write_end(dio);
-
-	if (likely(!dio->iter.count) || dio->op.error)
-		bch2_dio_write_done(dio);
-	else
-		bch2_dio_write_continue(dio);
-}
-
-ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
-{
-	struct file *file = req->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	struct bch_inode_info *inode = file_bch_inode(file);
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct dio_write *dio;
-	struct bio *bio;
-	bool locked = true, extending;
-	ssize_t ret;
-
-	prefetch(&c->opts);
-	prefetch((void *) &c->opts + 64);
-	prefetch(&inode->ei_inode);
-	prefetch((void *) &inode->ei_inode + 64);
-
-	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write))
-		return -EROFS;
-
-	inode_lock(&inode->v);
-
-	ret = generic_write_checks(req, iter);
-	if (unlikely(ret <= 0))
-		goto err_put_write_ref;
-
-	ret = file_remove_privs(file);
-	if (unlikely(ret))
-		goto err_put_write_ref;
-
-	ret = file_update_time(file);
-	if (unlikely(ret))
-		goto err_put_write_ref;
-
-	if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
-		goto err_put_write_ref;
-
-	inode_dio_begin(&inode->v);
-	bch2_pagecache_block_get(inode);
-
-	extending = req->ki_pos + iter->count > inode->v.i_size;
-	if (!extending) {
-		inode_unlock(&inode->v);
-		locked = false;
-	}
-
-	bio = bio_alloc_bioset(NULL,
-			       bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
-			       REQ_OP_WRITE | REQ_SYNC | REQ_IDLE,
-			       GFP_KERNEL,
-			       &c->dio_write_bioset);
-	dio = container_of(bio, struct dio_write, op.wbio.bio);
-	dio->req		= req;
-	dio->mapping		= mapping;
-	dio->inode		= inode;
-	dio->mm			= current->mm;
-	dio->iov		= NULL;
-	dio->loop		= false;
-	dio->extending		= extending;
-	dio->sync		= is_sync_kiocb(req) || extending;
-	dio->flush		= iocb_is_dsync(req) && !c->opts.journal_flush_disabled;
-	dio->quota_res.sectors	= 0;
-	dio->written		= 0;
-	dio->iter		= *iter;
-	dio->op.c		= c;
-
-	if (unlikely(mapping->nrpages)) {
-		ret = bch2_write_invalidate_inode_pages_range(mapping,
-						req->ki_pos,
-						req->ki_pos + iter->count - 1);
-		if (unlikely(ret))
-			goto err_put_bio;
-	}
-
-	ret = bch2_dio_write_loop(dio);
-out:
-	if (locked)
-		inode_unlock(&inode->v);
-	return ret;
-err_put_bio:
-	bch2_pagecache_block_put(inode);
-	bio_put(bio);
-	inode_dio_end(&inode->v);
-err_put_write_ref:
-	bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
-	goto out;
-}
-
-void bch2_fs_fs_io_direct_exit(struct bch_fs *c)
-{
-	bioset_exit(&c->dio_write_bioset);
-	bioset_exit(&c->dio_read_bioset);
-}
-
-int bch2_fs_fs_io_direct_init(struct bch_fs *c)
-{
-	if (bioset_init(&c->dio_read_bioset,
-			4, offsetof(struct dio_read, rbio.bio),
-			BIOSET_NEED_BVECS))
-		return -BCH_ERR_ENOMEM_dio_read_bioset_init;
-
-	if (bioset_init(&c->dio_write_bioset,
-			4, offsetof(struct dio_write, op.wbio.bio),
-			BIOSET_NEED_BVECS))
-		return -BCH_ERR_ENOMEM_dio_write_bioset_init;
-
-	return 0;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io-direct.h b/fs/bcachefs/fs-io-direct.h
deleted file mode 100644
index 814621ec7f81..000000000000
--- a/fs/bcachefs/fs-io-direct.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_IO_DIRECT_H
-#define _BCACHEFS_FS_IO_DIRECT_H
-
-#ifndef NO_BCACHEFS_FS
-ssize_t bch2_direct_write(struct kiocb *, struct iov_iter *);
-ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *);
-
-void bch2_fs_fs_io_direct_exit(struct bch_fs *);
-int bch2_fs_fs_io_direct_init(struct bch_fs *);
-#else
-static inline void bch2_fs_fs_io_direct_exit(struct bch_fs *c) {}
-static inline int bch2_fs_fs_io_direct_init(struct bch_fs *c) { return 0; }
-#endif
-
-#endif /* _BCACHEFS_FS_IO_DIRECT_H */
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
deleted file mode 100644
index 872283e5bd1e..000000000000
--- a/fs/bcachefs/fs-io-pagecache.c
+++ /dev/null
@@ -1,802 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "btree_iter.h"
-#include "extents.h"
-#include "fs-io.h"
-#include "fs-io-pagecache.h"
-#include "subvolume.h"
-
-#include <linux/pagevec.h>
-#include <linux/writeback.h>
-
-int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
-				     loff_t start, u64 end,
-				     fgf_t fgp_flags, gfp_t gfp,
-				     folios *fs)
-{
-	struct folio *f;
-	u64 pos = start;
-	int ret = 0;
-
-	while (pos < end) {
-		if ((u64) pos >= (u64) start + (1ULL << 20))
-			fgp_flags &= ~FGP_CREAT;
-
-		ret = darray_make_room_gfp(fs, 1, gfp & GFP_KERNEL);
-		if (ret)
-			break;
-
-		f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp);
-		if (IS_ERR_OR_NULL(f))
-			break;
-
-		BUG_ON(fs->nr && folio_pos(f) != pos);
-
-		pos = folio_end_pos(f);
-		darray_push(fs, f);
-	}
-
-	if (!fs->nr && !ret && (fgp_flags & FGP_CREAT))
-		ret = -ENOMEM;
-
-	return fs->nr ? 0 : ret;
-}
-
-/* pagecache_block must be held */
-int bch2_write_invalidate_inode_pages_range(struct address_space *mapping,
-					    loff_t start, loff_t end)
-{
-	int ret;
-
-	/*
-	 * XXX: the way this is currently implemented, we can spin if a process
-	 * is continually redirtying a specific page
-	 */
-	do {
-		if (!mapping->nrpages)
-			return 0;
-
-		ret = filemap_write_and_wait_range(mapping, start, end);
-		if (ret)
-			break;
-
-		if (!mapping->nrpages)
-			return 0;
-
-		ret = invalidate_inode_pages2_range(mapping,
-				start >> PAGE_SHIFT,
-				end >> PAGE_SHIFT);
-	} while (ret == -EBUSY);
-
-	return ret;
-}
-
-#if 0
-/* Useful for debug tracing: */
-static const char * const bch2_folio_sector_states[] = {
-#define x(n)	#n,
-	BCH_FOLIO_SECTOR_STATE()
-#undef x
-	NULL
-};
-#endif
-
-static inline enum bch_folio_sector_state
-folio_sector_dirty(enum bch_folio_sector_state state)
-{
-	switch (state) {
-	case SECTOR_unallocated:
-		return SECTOR_dirty;
-	case SECTOR_reserved:
-		return SECTOR_dirty_reserved;
-	default:
-		return state;
-	}
-}
-
-static inline enum bch_folio_sector_state
-folio_sector_undirty(enum bch_folio_sector_state state)
-{
-	switch (state) {
-	case SECTOR_dirty:
-		return SECTOR_unallocated;
-	case SECTOR_dirty_reserved:
-		return SECTOR_reserved;
-	default:
-		return state;
-	}
-}
-
-static inline enum bch_folio_sector_state
-folio_sector_reserve(enum bch_folio_sector_state state)
-{
-	switch (state) {
-	case SECTOR_unallocated:
-		return SECTOR_reserved;
-	case SECTOR_dirty:
-		return SECTOR_dirty_reserved;
-	default:
-		return state;
-	}
-}
-
-/* for newly allocated folios: */
-struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp)
-{
-	struct bch_folio *s;
-
-	s = kzalloc(sizeof(*s) +
-		    sizeof(struct bch_folio_sector) *
-		    folio_sectors(folio), gfp);
-	if (!s)
-		return NULL;
-
-	spin_lock_init(&s->lock);
-	folio_attach_private(folio, s);
-	return s;
-}
-
-struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp)
-{
-	return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp);
-}
-
-static unsigned bkey_to_sector_state(struct bkey_s_c k)
-{
-	if (bkey_extent_is_reservation(k))
-		return SECTOR_reserved;
-	if (bkey_extent_is_allocation(k.k))
-		return SECTOR_allocated;
-	return SECTOR_unallocated;
-}
-
-static void __bch2_folio_set(struct folio *folio,
-			     unsigned pg_offset, unsigned pg_len,
-			     unsigned nr_ptrs, unsigned state)
-{
-	struct bch_folio *s = bch2_folio(folio);
-	unsigned i, sectors = folio_sectors(folio);
-
-	BUG_ON(pg_offset >= sectors);
-	BUG_ON(pg_offset + pg_len > sectors);
-
-	spin_lock(&s->lock);
-
-	for (i = pg_offset; i < pg_offset + pg_len; i++) {
-		s->s[i].nr_replicas	= nr_ptrs;
-		bch2_folio_sector_set(folio, s, i, state);
-	}
-
-	if (i == sectors)
-		s->uptodate = true;
-
-	spin_unlock(&s->lock);
-}
-
-/*
- * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the
- * extents btree:
- */
-int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
-		   struct folio **fs, unsigned nr_folios)
-{
-	struct btree_trans *trans;
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	struct bch_folio *s;
-	u64 offset = folio_sector(fs[0]);
-	unsigned folio_idx;
-	u32 snapshot;
-	bool need_set = false;
-	int ret;
-
-	for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
-		s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
-		if (!s)
-			return -ENOMEM;
-
-		need_set |= !s->uptodate;
-	}
-
-	if (!need_set)
-		return 0;
-
-	folio_idx = 0;
-	trans = bch2_trans_get(c);
-retry:
-	bch2_trans_begin(trans);
-
-	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-	if (ret)
-		goto err;
-
-	for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
-			   SPOS(inum.inum, offset, snapshot),
-			   BTREE_ITER_slots, k, ret) {
-		unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
-		unsigned state = bkey_to_sector_state(k);
-
-		while (folio_idx < nr_folios) {
-			struct folio *folio = fs[folio_idx];
-			u64 folio_start	= folio_sector(folio);
-			u64 folio_end	= folio_end_sector(folio);
-			unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
-				folio_start;
-			unsigned folio_len = min(k.k->p.offset, folio_end) -
-				folio_offset - folio_start;
-
-			BUG_ON(k.k->p.offset < folio_start);
-			BUG_ON(bkey_start_offset(k.k) > folio_end);
-
-			if (!bch2_folio(folio)->uptodate)
-				__bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
-
-			if (k.k->p.offset < folio_end)
-				break;
-			folio_idx++;
-		}
-
-		if (folio_idx == nr_folios)
-			break;
-	}
-
-	offset = iter.pos.offset;
-	bch2_trans_iter_exit(trans, &iter);
-err:
-	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-		goto retry;
-	bch2_trans_put(trans);
-
-	return ret;
-}
-
-void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
-{
-	struct bvec_iter iter;
-	struct folio_vec fv;
-	unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
-		? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
-	unsigned state = bkey_to_sector_state(k);
-
-	bio_for_each_folio(fv, bio, iter)
-		__bch2_folio_set(fv.fv_folio,
-				 fv.fv_offset >> 9,
-				 fv.fv_len >> 9,
-				 nr_ptrs, state);
-}
-
-void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode,
-				     u64 start, u64 end)
-{
-	pgoff_t index = start >> PAGE_SECTORS_SHIFT;
-	pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
-	struct folio_batch fbatch;
-	unsigned i, j;
-
-	if (end <= start)
-		return;
-
-	folio_batch_init(&fbatch);
-
-	while (filemap_get_folios(inode->v.i_mapping,
-				  &index, end_index, &fbatch)) {
-		for (i = 0; i < folio_batch_count(&fbatch); i++) {
-			struct folio *folio = fbatch.folios[i];
-			u64 folio_start = folio_sector(folio);
-			u64 folio_end = folio_end_sector(folio);
-			unsigned folio_offset = max(start, folio_start) - folio_start;
-			unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
-			struct bch_folio *s;
-
-			BUG_ON(end <= folio_start);
-
-			folio_lock(folio);
-			s = bch2_folio(folio);
-
-			if (s) {
-				spin_lock(&s->lock);
-				for (j = folio_offset; j < folio_offset + folio_len; j++)
-					s->s[j].nr_replicas = 0;
-				spin_unlock(&s->lock);
-			}
-
-			folio_unlock(folio);
-		}
-		folio_batch_release(&fbatch);
-		cond_resched();
-	}
-}
-
-int bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
-				 u64 *start, u64 end,
-				 bool nonblocking)
-{
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	pgoff_t index = *start >> PAGE_SECTORS_SHIFT;
-	pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
-	struct folio_batch fbatch;
-	s64 i_sectors_delta = 0;
-	int ret = 0;
-
-	if (end <= *start)
-		return 0;
-
-	folio_batch_init(&fbatch);
-
-	while (filemap_get_folios(inode->v.i_mapping,
-				  &index, end_index, &fbatch)) {
-		for (unsigned i = 0; i < folio_batch_count(&fbatch); i++) {
-			struct folio *folio = fbatch.folios[i];
-
-			if (!nonblocking)
-				folio_lock(folio);
-			else if (!folio_trylock(folio)) {
-				folio_batch_release(&fbatch);
-				ret = -EAGAIN;
-				break;
-			}
-
-			u64 folio_start = folio_sector(folio);
-			u64 folio_end = folio_end_sector(folio);
-
-			BUG_ON(end <= folio_start);
-
-			*start = min(end, folio_end);
-
-			struct bch_folio *s = bch2_folio(folio);
-			if (s) {
-				unsigned folio_offset = max(*start, folio_start) - folio_start;
-				unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
-
-				spin_lock(&s->lock);
-				for (unsigned j = folio_offset; j < folio_offset + folio_len; j++) {
-					i_sectors_delta -= s->s[j].state == SECTOR_dirty;
-					bch2_folio_sector_set(folio, s, j,
-						folio_sector_reserve(s->s[j].state));
-				}
-				spin_unlock(&s->lock);
-			}
-
-			folio_unlock(folio);
-		}
-		folio_batch_release(&fbatch);
-		cond_resched();
-	}
-
-	bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
-	return ret;
-}
-
-static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
-					  unsigned nr_replicas)
-{
-	return max(0, (int) nr_replicas -
-		   s->nr_replicas -
-		   s->replicas_reserved);
-}
-
-int bch2_get_folio_disk_reservation(struct bch_fs *c,
-				struct bch_inode_info *inode,
-				struct folio *folio, bool check_enospc)
-{
-	struct bch_folio *s = bch2_folio_create(folio, 0);
-	unsigned nr_replicas = inode_nr_replicas(c, inode);
-	struct disk_reservation disk_res = { 0 };
-	unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0;
-	int ret;
-
-	if (!s)
-		return -ENOMEM;
-
-	for (i = 0; i < sectors; i++)
-		disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
-
-	if (!disk_res_sectors)
-		return 0;
-
-	ret = bch2_disk_reservation_get(c, &disk_res,
-					disk_res_sectors, 1,
-					!check_enospc
-					? BCH_DISK_RESERVATION_NOFAIL
-					: 0);
-	if (unlikely(ret))
-		return ret;
-
-	for (i = 0; i < sectors; i++)
-		s->s[i].replicas_reserved +=
-			sectors_to_reserve(&s->s[i], nr_replicas);
-
-	return 0;
-}
-
-void bch2_folio_reservation_put(struct bch_fs *c,
-			struct bch_inode_info *inode,
-			struct bch2_folio_reservation *res)
-{
-	bch2_disk_reservation_put(c, &res->disk);
-	bch2_quota_reservation_put(c, inode, &res->quota);
-}
-
-int bch2_folio_reservation_get(struct bch_fs *c,
-			struct bch_inode_info *inode,
-			struct folio *folio,
-			struct bch2_folio_reservation *res,
-			unsigned offset, unsigned len)
-{
-	struct bch_folio *s = bch2_folio_create(folio, 0);
-	unsigned i, disk_sectors = 0, quota_sectors = 0;
-	int ret;
-
-	if (!s)
-		return -ENOMEM;
-
-	BUG_ON(!s->uptodate);
-
-	for (i = round_down(offset, block_bytes(c)) >> 9;
-	     i < round_up(offset + len, block_bytes(c)) >> 9;
-	     i++) {
-		disk_sectors += sectors_to_reserve(&s->s[i],
-						res->disk.nr_replicas);
-		quota_sectors += s->s[i].state == SECTOR_unallocated;
-	}
-
-	if (disk_sectors) {
-		ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0);
-		if (unlikely(ret))
-			return ret;
-	}
-
-	if (quota_sectors) {
-		ret = bch2_quota_reservation_add(c, inode, &res->quota,
-						 quota_sectors, true);
-		if (unlikely(ret)) {
-			struct disk_reservation tmp = {
-				.sectors = disk_sectors
-			};
-
-			bch2_disk_reservation_put(c, &tmp);
-			res->disk.sectors -= disk_sectors;
-			return ret;
-		}
-	}
-
-	return 0;
-}
-
-static void bch2_clear_folio_bits(struct folio *folio)
-{
-	struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct bch_folio *s = bch2_folio(folio);
-	struct disk_reservation disk_res = { 0 };
-	int i, sectors = folio_sectors(folio), dirty_sectors = 0;
-
-	if (!s)
-		return;
-
-	EBUG_ON(!folio_test_locked(folio));
-	EBUG_ON(folio_test_writeback(folio));
-
-	for (i = 0; i < sectors; i++) {
-		disk_res.sectors += s->s[i].replicas_reserved;
-		s->s[i].replicas_reserved = 0;
-
-		dirty_sectors -= s->s[i].state == SECTOR_dirty;
-		bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state));
-	}
-
-	bch2_disk_reservation_put(c, &disk_res);
-
-	bch2_i_sectors_acct(c, inode, NULL, dirty_sectors);
-
-	bch2_folio_release(folio);
-}
-
-void bch2_set_folio_dirty(struct bch_fs *c,
-			  struct bch_inode_info *inode,
-			  struct folio *folio,
-			  struct bch2_folio_reservation *res,
-			  unsigned offset, unsigned len)
-{
-	struct bch_folio *s = bch2_folio(folio);
-	unsigned i, dirty_sectors = 0;
-
-	WARN_ON((u64) folio_pos(folio) + offset + len >
-		round_up((u64) i_size_read(&inode->v), block_bytes(c)));
-
-	BUG_ON(!s->uptodate);
-
-	spin_lock(&s->lock);
-
-	for (i = round_down(offset, block_bytes(c)) >> 9;
-	     i < round_up(offset + len, block_bytes(c)) >> 9;
-	     i++) {
-		unsigned sectors = sectors_to_reserve(&s->s[i],
-						res->disk.nr_replicas);
-
-		/*
-		 * This can happen if we race with the error path in
-		 * bch2_writepage_io_done():
-		 */
-		sectors = min_t(unsigned, sectors, res->disk.sectors);
-
-		s->s[i].replicas_reserved += sectors;
-		res->disk.sectors -= sectors;
-
-		dirty_sectors += s->s[i].state == SECTOR_unallocated;
-
-		bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state));
-	}
-
-	spin_unlock(&s->lock);
-
-	bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors);
-
-	if (!folio_test_dirty(folio))
-		filemap_dirty_folio(inode->v.i_mapping, folio);
-}
-
-vm_fault_t bch2_page_fault(struct vm_fault *vmf)
-{
-	struct file *file = vmf->vma->vm_file;
-	struct address_space *mapping = file->f_mapping;
-	struct address_space *fdm = faults_disabled_mapping();
-	struct bch_inode_info *inode = file_bch_inode(file);
-	vm_fault_t ret;
-
-	if (fdm == mapping)
-		return VM_FAULT_SIGBUS;
-
-	/* Lock ordering: */
-	if (fdm > mapping) {
-		struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
-
-		if (bch2_pagecache_add_tryget(inode))
-			goto got_lock;
-
-		bch2_pagecache_block_put(fdm_host);
-
-		bch2_pagecache_add_get(inode);
-		bch2_pagecache_add_put(inode);
-
-		bch2_pagecache_block_get(fdm_host);
-
-		/* Signal that lock has been dropped: */
-		set_fdm_dropped_locks();
-		return VM_FAULT_SIGBUS;
-	}
-
-	bch2_pagecache_add_get(inode);
-got_lock:
-	ret = filemap_fault(vmf);
-	bch2_pagecache_add_put(inode);
-
-	return ret;
-}
-
-vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
-{
-	struct folio *folio = page_folio(vmf->page);
-	struct file *file = vmf->vma->vm_file;
-	struct bch_inode_info *inode = file_bch_inode(file);
-	struct address_space *mapping = file->f_mapping;
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct bch2_folio_reservation res;
-	unsigned len;
-	loff_t isize;
-	vm_fault_t ret;
-
-	bch2_folio_reservation_init(c, inode, &res);
-
-	sb_start_pagefault(inode->v.i_sb);
-	file_update_time(file);
-
-	/*
-	 * Not strictly necessary, but helps avoid dio writes livelocking in
-	 * bch2_write_invalidate_inode_pages_range() - can drop this if/when we get
-	 * a bch2_write_invalidate_inode_pages_range() that works without dropping
-	 * page lock before invalidating page
-	 */
-	bch2_pagecache_add_get(inode);
-
-	folio_lock(folio);
-	isize = i_size_read(&inode->v);
-
-	if (folio->mapping != mapping || folio_pos(folio) >= isize) {
-		folio_unlock(folio);
-		ret = VM_FAULT_NOPAGE;
-		goto out;
-	}
-
-	len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio));
-
-	if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
-	    bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) {
-		folio_unlock(folio);
-		ret = VM_FAULT_SIGBUS;
-		goto out;
-	}
-
-	bch2_set_folio_dirty(c, inode, folio, &res, 0, len);
-	bch2_folio_reservation_put(c, inode, &res);
-
-	folio_wait_stable(folio);
-	ret = VM_FAULT_LOCKED;
-out:
-	bch2_pagecache_add_put(inode);
-	sb_end_pagefault(inode->v.i_sb);
-
-	return ret;
-}
-
-void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
-{
-	if (offset || length < folio_size(folio))
-		return;
-
-	bch2_clear_folio_bits(folio);
-}
-
-bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
-{
-	if (folio_test_dirty(folio) || folio_test_writeback(folio))
-		return false;
-
-	bch2_clear_folio_bits(folio);
-	return true;
-}
-
-/* fseek: */
-
-static int folio_data_offset(struct folio *folio, loff_t pos,
-			     unsigned min_replicas)
-{
-	struct bch_folio *s = bch2_folio(folio);
-	unsigned i, sectors = folio_sectors(folio);
-
-	if (s)
-		for (i = folio_pos_to_s(folio, pos); i < sectors; i++)
-			if (s->s[i].state >= SECTOR_dirty &&
-			    s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas)
-				return i << SECTOR_SHIFT;
-
-	return -1;
-}
-
-loff_t bch2_seek_pagecache_data(struct inode *vinode,
-				loff_t start_offset,
-				loff_t end_offset,
-				unsigned min_replicas,
-				bool nonblock)
-{
-	struct folio_batch fbatch;
-	pgoff_t start_index	= start_offset >> PAGE_SHIFT;
-	pgoff_t end_index	= end_offset >> PAGE_SHIFT;
-	pgoff_t index		= start_index;
-	unsigned i;
-	loff_t ret;
-	int offset;
-
-	folio_batch_init(&fbatch);
-
-	while (filemap_get_folios(vinode->i_mapping,
-				  &index, end_index, &fbatch)) {
-		for (i = 0; i < folio_batch_count(&fbatch); i++) {
-			struct folio *folio = fbatch.folios[i];
-
-			if (!nonblock) {
-				folio_lock(folio);
-			} else if (!folio_trylock(folio)) {
-				folio_batch_release(&fbatch);
-				return -EAGAIN;
-			}
-
-			offset = folio_data_offset(folio,
-					max(folio_pos(folio), start_offset),
-					min_replicas);
-			if (offset >= 0) {
-				ret = clamp(folio_pos(folio) + offset,
-					    start_offset, end_offset);
-				folio_unlock(folio);
-				folio_batch_release(&fbatch);
-				return ret;
-			}
-			folio_unlock(folio);
-		}
-		folio_batch_release(&fbatch);
-		cond_resched();
-	}
-
-	return end_offset;
-}
-
-/*
- * Search for a hole in a folio.
- *
- * The filemap layer returns -ENOENT if no folio exists, so reuse the same error
- * code to indicate a pagecache hole exists at the returned offset. Otherwise
- * return 0 if the folio is filled with data, or an error code. This function
- * can return -EAGAIN if nonblock is specified.
- */
-static int folio_hole_offset(struct address_space *mapping, loff_t *offset,
-			      unsigned min_replicas, bool nonblock)
-{
-	struct folio *folio;
-	struct bch_folio *s;
-	unsigned i, sectors;
-	int ret = -ENOENT;
-
-	folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT,
-				    FGP_LOCK|(nonblock ? FGP_NOWAIT : 0), 0);
-	if (IS_ERR(folio))
-		return PTR_ERR(folio);
-
-	s = bch2_folio(folio);
-	if (!s)
-		goto unlock;
-
-	sectors = folio_sectors(folio);
-	for (i = folio_pos_to_s(folio, *offset); i < sectors; i++)
-		if (s->s[i].state < SECTOR_dirty ||
-		    s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) {
-			*offset = max(*offset,
-				      folio_pos(folio) + (i << SECTOR_SHIFT));
-			goto unlock;
-		}
-
-	*offset = folio_end_pos(folio);
-	ret = 0;
-unlock:
-	folio_unlock(folio);
-	folio_put(folio);
-	return ret;
-}
-
-loff_t bch2_seek_pagecache_hole(struct inode *vinode,
-				loff_t start_offset,
-				loff_t end_offset,
-				unsigned min_replicas,
-				bool nonblock)
-{
-	struct address_space *mapping = vinode->i_mapping;
-	loff_t offset = start_offset;
-	loff_t ret = 0;
-
-	while (!ret && offset < end_offset)
-		ret = folio_hole_offset(mapping, &offset, min_replicas, nonblock);
-
-	if (ret && ret != -ENOENT)
-		return ret;
-	return min(offset, end_offset);
-}
-
-int bch2_clamp_data_hole(struct inode *inode,
-			 u64 *hole_start,
-			 u64 *hole_end,
-			 unsigned min_replicas,
-			 bool nonblock)
-{
-	loff_t ret;
-
-	ret = bch2_seek_pagecache_hole(inode,
-		*hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
-	if (ret < 0)
-		return ret;
-
-	*hole_start = ret;
-
-	if (*hole_start == *hole_end)
-		return 0;
-
-	ret = bch2_seek_pagecache_data(inode,
-		*hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
-	if (ret < 0)
-		return ret;
-
-	*hole_end = ret;
-	return 0;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h
deleted file mode 100644
index 828c3d7c8f19..000000000000
--- a/fs/bcachefs/fs-io-pagecache.h
+++ /dev/null
@@ -1,173 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_IO_PAGECACHE_H
-#define _BCACHEFS_FS_IO_PAGECACHE_H
-
-#include <linux/pagemap.h>
-
-typedef DARRAY(struct folio *) folios;
-
-int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t,
-				     u64, fgf_t, gfp_t, folios *);
-int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t);
-
-/*
- * Use u64 for the end pos and sector helpers because if the folio covers the
- * max supported range of the mapping, the start offset of the next folio
- * overflows loff_t. This breaks much of the range based processing in the
- * buffered write path.
- */
-static inline u64 folio_end_pos(struct folio *folio)
-{
-	return folio_pos(folio) + folio_size(folio);
-}
-
-static inline size_t folio_sectors(struct folio *folio)
-{
-	return PAGE_SECTORS << folio_order(folio);
-}
-
-static inline loff_t folio_sector(struct folio *folio)
-{
-	return folio_pos(folio) >> 9;
-}
-
-static inline u64 folio_end_sector(struct folio *folio)
-{
-	return folio_end_pos(folio) >> 9;
-}
-
-#define BCH_FOLIO_SECTOR_STATE()	\
-	x(unallocated)			\
-	x(reserved)			\
-	x(dirty)			\
-	x(dirty_reserved)		\
-	x(allocated)
-
-enum bch_folio_sector_state {
-#define x(n)	SECTOR_##n,
-	BCH_FOLIO_SECTOR_STATE()
-#undef x
-};
-
-struct bch_folio_sector {
-	/* Uncompressed, fully allocated replicas (or on disk reservation): */
-	u8			nr_replicas:4,
-	/* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */
-				replicas_reserved:4;
-	u8			state;
-};
-
-struct bch_folio {
-	spinlock_t		lock;
-	atomic_t		write_count;
-	/*
-	 * Is the sector state up to date with the btree?
-	 * (Not the data itself)
-	 */
-	bool			uptodate;
-	struct bch_folio_sector	s[];
-};
-
-/* Helper for when we need to add debug instrumentation: */
-static inline void bch2_folio_sector_set(struct folio *folio,
-			     struct bch_folio *s,
-			     unsigned i, unsigned n)
-{
-	s->s[i].state = n;
-}
-
-/* file offset (to folio offset) to bch_folio_sector index */
-static inline int folio_pos_to_s(struct folio *folio, loff_t pos)
-{
-	u64 f_offset = pos - folio_pos(folio);
-
-	BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio));
-	return f_offset >> SECTOR_SHIFT;
-}
-
-/* for newly allocated folios: */
-static inline void __bch2_folio_release(struct folio *folio)
-{
-	kfree(folio_detach_private(folio));
-}
-
-static inline void bch2_folio_release(struct folio *folio)
-{
-	EBUG_ON(!folio_test_locked(folio));
-	__bch2_folio_release(folio);
-}
-
-static inline struct bch_folio *__bch2_folio(struct folio *folio)
-{
-	return folio_has_private(folio)
-		? (struct bch_folio *) folio_get_private(folio)
-		: NULL;
-}
-
-static inline struct bch_folio *bch2_folio(struct folio *folio)
-{
-	EBUG_ON(!folio_test_locked(folio));
-
-	return __bch2_folio(folio);
-}
-
-struct bch_folio *__bch2_folio_create(struct folio *, gfp_t);
-struct bch_folio *bch2_folio_create(struct folio *, gfp_t);
-
-struct bch2_folio_reservation {
-	struct disk_reservation	disk;
-	struct quota_res	quota;
-};
-
-static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
-{
-	/* XXX: this should not be open coded */
-	return inode->ei_inode.bi_data_replicas
-		? inode->ei_inode.bi_data_replicas - 1
-		: c->opts.data_replicas;
-}
-
-static inline void bch2_folio_reservation_init(struct bch_fs *c,
-			struct bch_inode_info *inode,
-			struct bch2_folio_reservation *res)
-{
-	memset(res, 0, sizeof(*res));
-
-	res->disk.nr_replicas = inode_nr_replicas(c, inode);
-}
-
-int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned);
-void bch2_bio_page_state_set(struct bio *, struct bkey_s_c);
-
-void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64);
-int bch2_mark_pagecache_reserved(struct bch_inode_info *, u64 *, u64, bool);
-
-int bch2_get_folio_disk_reservation(struct bch_fs *,
-				struct bch_inode_info *,
-				struct folio *, bool);
-
-void bch2_folio_reservation_put(struct bch_fs *,
-			struct bch_inode_info *,
-			struct bch2_folio_reservation *);
-int bch2_folio_reservation_get(struct bch_fs *,
-			struct bch_inode_info *,
-			struct folio *,
-			struct bch2_folio_reservation *,
-			unsigned, unsigned);
-
-void bch2_set_folio_dirty(struct bch_fs *,
-			  struct bch_inode_info *,
-			  struct folio *,
-			  struct bch2_folio_reservation *,
-			  unsigned, unsigned);
-
-vm_fault_t bch2_page_fault(struct vm_fault *);
-vm_fault_t bch2_page_mkwrite(struct vm_fault *);
-void bch2_invalidate_folio(struct folio *, size_t, size_t);
-bool bch2_release_folio(struct folio *, gfp_t);
-
-loff_t bch2_seek_pagecache_data(struct inode *, loff_t, loff_t, unsigned, bool);
-loff_t bch2_seek_pagecache_hole(struct inode *, loff_t, loff_t, unsigned, bool);
-int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool);
-
-#endif /* _BCACHEFS_FS_IO_PAGECACHE_H */
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
deleted file mode 100644
index ef20b64033e0..000000000000
--- a/fs/bcachefs/fs-io.c
+++ /dev/null
@@ -1,1084 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "clock.h"
-#include "error.h"
-#include "extents.h"
-#include "extent_update.h"
-#include "fs.h"
-#include "fs-io.h"
-#include "fs-io-buffered.h"
-#include "fs-io-pagecache.h"
-#include "fsck.h"
-#include "inode.h"
-#include "journal.h"
-#include "io_misc.h"
-#include "keylist.h"
-#include "quota.h"
-#include "reflink.h"
-#include "trace.h"
-
-#include <linux/aio.h>
-#include <linux/backing-dev.h>
-#include <linux/falloc.h>
-#include <linux/migrate.h>
-#include <linux/mmu_context.h>
-#include <linux/pagevec.h>
-#include <linux/rmap.h>
-#include <linux/sched/signal.h>
-#include <linux/task_io_accounting_ops.h>
-#include <linux/uio.h>
-
-#include <trace/events/writeback.h>
-
-struct nocow_flush {
-	struct closure	*cl;
-	struct bch_dev	*ca;
-	struct bio	bio;
-};
-
-static void nocow_flush_endio(struct bio *_bio)
-{
-
-	struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio);
-
-	closure_put(bio->cl);
-	percpu_ref_put(&bio->ca->io_ref);
-	bio_put(&bio->bio);
-}
-
-void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
-					 struct bch_inode_info *inode,
-					 struct closure *cl)
-{
-	struct nocow_flush *bio;
-	struct bch_dev *ca;
-	struct bch_devs_mask devs;
-	unsigned dev;
-
-	dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX);
-	if (dev == BCH_SB_MEMBERS_MAX)
-		return;
-
-	devs = inode->ei_devs_need_flush;
-	memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
-
-	for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
-		rcu_read_lock();
-		ca = rcu_dereference(c->devs[dev]);
-		if (ca && !percpu_ref_tryget(&ca->io_ref))
-			ca = NULL;
-		rcu_read_unlock();
-
-		if (!ca)
-			continue;
-
-		bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
-						    REQ_OP_WRITE|REQ_PREFLUSH,
-						    GFP_KERNEL,
-						    &c->nocow_flush_bioset),
-				   struct nocow_flush, bio);
-		bio->cl			= cl;
-		bio->ca			= ca;
-		bio->bio.bi_end_io	= nocow_flush_endio;
-		closure_bio_submit(&bio->bio, cl);
-	}
-}
-
-static int bch2_inode_flush_nocow_writes(struct bch_fs *c,
-					 struct bch_inode_info *inode)
-{
-	struct closure cl;
-
-	closure_init_stack(&cl);
-	bch2_inode_flush_nocow_writes_async(c, inode, &cl);
-	closure_sync(&cl);
-
-	return 0;
-}
-
-/* i_size updates: */
-
-struct inode_new_size {
-	loff_t		new_size;
-	u64		now;
-	unsigned	fields;
-};
-
-static int inode_set_size(struct btree_trans *trans,
-			  struct bch_inode_info *inode,
-			  struct bch_inode_unpacked *bi,
-			  void *p)
-{
-	struct inode_new_size *s = p;
-
-	bi->bi_size = s->new_size;
-	if (s->fields & ATTR_ATIME)
-		bi->bi_atime = s->now;
-	if (s->fields & ATTR_MTIME)
-		bi->bi_mtime = s->now;
-	if (s->fields & ATTR_CTIME)
-		bi->bi_ctime = s->now;
-
-	return 0;
-}
-
-int __must_check bch2_write_inode_size(struct bch_fs *c,
-				       struct bch_inode_info *inode,
-				       loff_t new_size, unsigned fields)
-{
-	struct inode_new_size s = {
-		.new_size	= new_size,
-		.now		= bch2_current_time(c),
-		.fields		= fields,
-	};
-
-	return bch2_write_inode(c, inode, inode_set_size, &s, fields);
-}
-
-void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
-			   struct quota_res *quota_res, s64 sectors)
-{
-	bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c,
-				"inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
-				inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
-				inode->ei_inode.bi_sectors);
-	inode->v.i_blocks += sectors;
-
-#ifdef CONFIG_BCACHEFS_QUOTA
-	if (quota_res &&
-	    !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) &&
-	    sectors > 0) {
-		BUG_ON(sectors > quota_res->sectors);
-		BUG_ON(sectors > inode->ei_quota_reserved);
-
-		quota_res->sectors -= sectors;
-		inode->ei_quota_reserved -= sectors;
-	} else {
-		bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
-	}
-#endif
-}
-
-/* fsync: */
-
-/*
- * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an
- * insert trigger: look up the btree inode instead
- */
-static int bch2_flush_inode(struct bch_fs *c,
-			    struct bch_inode_info *inode)
-{
-	if (c->opts.journal_flush_disabled)
-		return 0;
-
-	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync))
-		return -EROFS;
-
-	struct bch_inode_unpacked u;
-	int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?:
-		  bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?:
-		  bch2_inode_flush_nocow_writes(c, inode);
-	bch2_write_ref_put(c, BCH_WRITE_REF_fsync);
-	return ret;
-}
-
-int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
-{
-	struct bch_inode_info *inode = file_bch_inode(file);
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	int ret;
-
-	ret = file_write_and_wait_range(file, start, end);
-	if (ret)
-		goto out;
-	ret = sync_inode_metadata(&inode->v, 1);
-	if (ret)
-		goto out;
-	ret = bch2_flush_inode(c, inode);
-out:
-	ret = bch2_err_class(ret);
-	if (ret == -EROFS)
-		ret = -EIO;
-	return ret;
-}
-
-/* truncate: */
-
-static inline int range_has_data(struct bch_fs *c, u32 subvol,
-				 struct bpos start,
-				 struct bpos end)
-{
-	struct btree_trans *trans = bch2_trans_get(c);
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	int ret = 0;
-retry:
-	bch2_trans_begin(trans);
-
-	ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot);
-	if (ret)
-		goto err;
-
-	for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret)
-		if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) {
-			ret = 1;
-			break;
-		}
-	start = iter.pos;
-	bch2_trans_iter_exit(trans, &iter);
-err:
-	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-		goto retry;
-
-	bch2_trans_put(trans);
-	return ret;
-}
-
-static int __bch2_truncate_folio(struct bch_inode_info *inode,
-				 pgoff_t index, loff_t start, loff_t end)
-{
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct address_space *mapping = inode->v.i_mapping;
-	struct bch_folio *s;
-	unsigned start_offset;
-	unsigned end_offset;
-	unsigned i;
-	struct folio *folio;
-	s64 i_sectors_delta = 0;
-	int ret = 0;
-	u64 end_pos;
-
-	folio = filemap_lock_folio(mapping, index);
-	if (IS_ERR_OR_NULL(folio)) {
-		/*
-		 * XXX: we're doing two index lookups when we end up reading the
-		 * folio
-		 */
-		ret = range_has_data(c, inode->ei_subvol,
-				POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)),
-				POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS));
-		if (ret <= 0)
-			return ret;
-
-		folio = __filemap_get_folio(mapping, index,
-					    FGP_LOCK|FGP_CREAT, GFP_KERNEL);
-		if (IS_ERR_OR_NULL(folio)) {
-			ret = -ENOMEM;
-			goto out;
-		}
-	}
-
-	BUG_ON(start	>= folio_end_pos(folio));
-	BUG_ON(end	<= folio_pos(folio));
-
-	start_offset	= max(start, folio_pos(folio)) - folio_pos(folio);
-	end_offset	= min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio);
-
-	/* Folio boundary? Nothing to do */
-	if (start_offset == 0 &&
-	    end_offset == folio_size(folio)) {
-		ret = 0;
-		goto unlock;
-	}
-
-	s = bch2_folio_create(folio, 0);
-	if (!s) {
-		ret = -ENOMEM;
-		goto unlock;
-	}
-
-	if (!folio_test_uptodate(folio)) {
-		ret = bch2_read_single_folio(folio, mapping);
-		if (ret)
-			goto unlock;
-	}
-
-	ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
-	if (ret)
-		goto unlock;
-
-	for (i = round_up(start_offset, block_bytes(c)) >> 9;
-	     i < round_down(end_offset, block_bytes(c)) >> 9;
-	     i++) {
-		s->s[i].nr_replicas	= 0;
-
-		i_sectors_delta -= s->s[i].state == SECTOR_dirty;
-		bch2_folio_sector_set(folio, s, i, SECTOR_unallocated);
-	}
-
-	bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
-
-	/*
-	 * Caller needs to know whether this folio will be written out by
-	 * writeback - doing an i_size update if necessary - or whether it will
-	 * be responsible for the i_size update.
-	 *
-	 * Note that we shouldn't ever see a folio beyond EOF, but check and
-	 * warn if so. This has been observed by failure to clean up folios
-	 * after a short write and there's still a chance reclaim will fix
-	 * things up.
-	 */
-	WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size);
-	end_pos = folio_end_pos(folio);
-	if (inode->v.i_size > folio_pos(folio))
-		end_pos = min_t(u64, inode->v.i_size, end_pos);
-	ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty;
-
-	folio_zero_segment(folio, start_offset, end_offset);
-
-	/*
-	 * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
-	 *
-	 * XXX: because we aren't currently tracking whether the folio has actual
-	 * data in it (vs. just 0s, or only partially written) this wrong. ick.
-	 */
-	BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false));
-
-	/*
-	 * This removes any writeable userspace mappings; we need to force
-	 * .page_mkwrite to be called again before any mmapped writes, to
-	 * redirty the full page:
-	 */
-	folio_mkclean(folio);
-	filemap_dirty_folio(mapping, folio);
-unlock:
-	folio_unlock(folio);
-	folio_put(folio);
-out:
-	return ret;
-}
-
-static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from)
-{
-	return __bch2_truncate_folio(inode, from >> PAGE_SHIFT,
-				     from, ANYSINT_MAX(loff_t));
-}
-
-static int bch2_truncate_folios(struct bch_inode_info *inode,
-				loff_t start, loff_t end)
-{
-	int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT,
-					start, end);
-
-	if (ret >= 0 &&
-	    start >> PAGE_SHIFT != end >> PAGE_SHIFT)
-		ret = __bch2_truncate_folio(inode,
-					(end - 1) >> PAGE_SHIFT,
-					start, end);
-	return ret;
-}
-
-static int bch2_extend(struct mnt_idmap *idmap,
-		       struct bch_inode_info *inode,
-		       struct bch_inode_unpacked *inode_u,
-		       struct iattr *iattr)
-{
-	struct address_space *mapping = inode->v.i_mapping;
-	int ret;
-
-	/*
-	 * sync appends:
-	 *
-	 * this has to be done _before_ extending i_size:
-	 */
-	ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX);
-	if (ret)
-		return ret;
-
-	truncate_setsize(&inode->v, iattr->ia_size);
-
-	return bch2_setattr_nonsize(idmap, inode, iattr);
-}
-
-int bchfs_truncate(struct mnt_idmap *idmap,
-		  struct bch_inode_info *inode, struct iattr *iattr)
-{
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct address_space *mapping = inode->v.i_mapping;
-	struct bch_inode_unpacked inode_u;
-	s64 i_sectors_delta = 0;
-	int ret = 0;
-
-	/*
-	 * If the truncate call with change the size of the file, the
-	 * cmtimes should be updated. If the size will not change, we
-	 * do not need to update the cmtimes.
-	 */
-	if (iattr->ia_size != inode->v.i_size) {
-		if (!(iattr->ia_valid & ATTR_MTIME))
-			ktime_get_coarse_real_ts64(&iattr->ia_mtime);
-		if (!(iattr->ia_valid & ATTR_CTIME))
-			ktime_get_coarse_real_ts64(&iattr->ia_ctime);
-		iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME;
-	}
-
-	inode_dio_wait(&inode->v);
-	bch2_pagecache_block_get(inode);
-
-	ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
-	if (ret)
-		goto err;
-
-	/*
-	 * check this before next assertion; on filesystem error our normal
-	 * invariants are a bit broken (truncate has to truncate the page cache
-	 * before the inode).
-	 */
-	ret = bch2_journal_error(&c->journal);
-	if (ret)
-		goto err;
-
-	WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) &&
-		  inode->v.i_size < inode_u.bi_size,
-		  "truncate spotted in mem i_size < btree i_size: %llu < %llu\n",
-		  (u64) inode->v.i_size, inode_u.bi_size);
-
-	if (iattr->ia_size > inode->v.i_size) {
-		ret = bch2_extend(idmap, inode, &inode_u, iattr);
-		goto err;
-	}
-
-	iattr->ia_valid &= ~ATTR_SIZE;
-
-	ret = bch2_truncate_folio(inode, iattr->ia_size);
-	if (unlikely(ret < 0))
-		goto err;
-
-	truncate_setsize(&inode->v, iattr->ia_size);
-
-	/*
-	 * When extending, we're going to write the new i_size to disk
-	 * immediately so we need to flush anything above the current on disk
-	 * i_size first:
-	 *
-	 * Also, when extending we need to flush the page that i_size currently
-	 * straddles - if it's mapped to userspace, we need to ensure that
-	 * userspace has to redirty it and call .mkwrite -> set_page_dirty
-	 * again to allocate the part of the page that was extended.
-	 */
-	if (iattr->ia_size > inode_u.bi_size)
-		ret = filemap_write_and_wait_range(mapping,
-				inode_u.bi_size,
-				iattr->ia_size - 1);
-	else if (iattr->ia_size & (PAGE_SIZE - 1))
-		ret = filemap_write_and_wait_range(mapping,
-				round_down(iattr->ia_size, PAGE_SIZE),
-				iattr->ia_size - 1);
-	if (ret)
-		goto err;
-
-	ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta);
-	bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
-
-	if (unlikely(ret)) {
-		/*
-		 * If we error here, VFS caches are now inconsistent with btree
-		 */
-		set_bit(EI_INODE_ERROR, &inode->ei_flags);
-		goto err;
-	}
-
-	bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks &&
-				!bch2_journal_error(&c->journal), c,
-				"inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
-				inode->v.i_ino, (u64) inode->v.i_blocks,
-				inode->ei_inode.bi_sectors);
-
-	ret = bch2_setattr_nonsize(idmap, inode, iattr);
-err:
-	bch2_pagecache_block_put(inode);
-	return bch2_err_class(ret);
-}
-
-/* fallocate: */
-
-static int inode_update_times_fn(struct btree_trans *trans,
-				 struct bch_inode_info *inode,
-				 struct bch_inode_unpacked *bi, void *p)
-{
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-
-	bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
-	return 0;
-}
-
-static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
-{
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	u64 end		= offset + len;
-	u64 block_start	= round_up(offset, block_bytes(c));
-	u64 block_end	= round_down(end, block_bytes(c));
-	bool truncated_last_page;
-	int ret = 0;
-
-	ret = bch2_truncate_folios(inode, offset, end);
-	if (unlikely(ret < 0))
-		goto err;
-
-	truncated_last_page = ret;
-
-	truncate_pagecache_range(&inode->v, offset, end - 1);
-
-	if (block_start < block_end) {
-		s64 i_sectors_delta = 0;
-
-		ret = bch2_fpunch(c, inode_inum(inode),
-				  block_start >> 9, block_end >> 9,
-				  &i_sectors_delta);
-		bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
-	}
-
-	mutex_lock(&inode->ei_update_lock);
-	if (end >= inode->v.i_size && !truncated_last_page) {
-		ret = bch2_write_inode_size(c, inode, inode->v.i_size,
-					    ATTR_MTIME|ATTR_CTIME);
-	} else {
-		ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
-				       ATTR_MTIME|ATTR_CTIME);
-	}
-	mutex_unlock(&inode->ei_update_lock);
-err:
-	return ret;
-}
-
-static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
-				   loff_t offset, loff_t len,
-				   bool insert)
-{
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct address_space *mapping = inode->v.i_mapping;
-	s64 i_sectors_delta = 0;
-	int ret = 0;
-
-	if ((offset | len) & (block_bytes(c) - 1))
-		return -EINVAL;
-
-	if (insert) {
-		if (offset >= inode->v.i_size)
-			return -EINVAL;
-	} else {
-		if (offset + len >= inode->v.i_size)
-			return -EINVAL;
-	}
-
-	ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
-	if (ret)
-		return ret;
-
-	if (insert)
-		i_size_write(&inode->v, inode->v.i_size + len);
-
-	ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9,
-				     insert, &i_sectors_delta);
-	if (!ret && !insert)
-		i_size_write(&inode->v, inode->v.i_size - len);
-	bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
-
-	return ret;
-}
-
-static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
-			     u64 start_sector, u64 end_sector)
-{
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct btree_trans *trans = bch2_trans_get(c);
-	struct btree_iter iter;
-	struct bpos end_pos = POS(inode->v.i_ino, end_sector);
-	struct bch_io_opts opts;
-	int ret = 0;
-
-	bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
-			POS(inode->v.i_ino, start_sector),
-			BTREE_ITER_slots|BTREE_ITER_intent);
-
-	while (!ret && bkey_lt(iter.pos, end_pos)) {
-		s64 i_sectors_delta = 0;
-		struct quota_res quota_res = { 0 };
-		struct bkey_s_c k;
-		unsigned sectors;
-		bool is_allocation;
-		u64 hole_start, hole_end;
-		u32 snapshot;
-
-		bch2_trans_begin(trans);
-
-		ret = bch2_subvolume_get_snapshot(trans,
-					inode->ei_subvol, &snapshot);
-		if (ret)
-			goto bkey_err;
-
-		bch2_btree_iter_set_snapshot(&iter, snapshot);
-
-		k = bch2_btree_iter_peek_slot(&iter);
-		if ((ret = bkey_err(k)))
-			goto bkey_err;
-
-		hole_start	= iter.pos.offset;
-		hole_end	= bpos_min(k.k->p, end_pos).offset;
-		is_allocation	= bkey_extent_is_allocation(k.k);
-
-		/* already reserved */
-		if (bkey_extent_is_reservation(k) &&
-		    bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) {
-			bch2_btree_iter_advance(&iter);
-			continue;
-		}
-
-		if (bkey_extent_is_data(k.k) &&
-		    !(mode & FALLOC_FL_ZERO_RANGE)) {
-			bch2_btree_iter_advance(&iter);
-			continue;
-		}
-
-		if (!(mode & FALLOC_FL_ZERO_RANGE)) {
-			/*
-			 * Lock ordering - can't be holding btree locks while
-			 * blocking on a folio lock:
-			 */
-			if (bch2_clamp_data_hole(&inode->v,
-						 &hole_start,
-						 &hole_end,
-						 opts.data_replicas, true))
-				ret = drop_locks_do(trans,
-					(bch2_clamp_data_hole(&inode->v,
-							      &hole_start,
-							      &hole_end,
-							      opts.data_replicas, false), 0));
-			bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start));
-
-			if (ret)
-				goto bkey_err;
-
-			if (hole_start == hole_end)
-				continue;
-		}
-
-		sectors	= hole_end - hole_start;
-
-		if (!is_allocation) {
-			ret = bch2_quota_reservation_add(c, inode,
-					&quota_res, sectors, true);
-			if (unlikely(ret))
-				goto bkey_err;
-		}
-
-		ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter,
-					    sectors, opts, &i_sectors_delta,
-					    writepoint_hashed((unsigned long) current));
-		if (ret)
-			goto bkey_err;
-
-		bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
-
-		if (bch2_mark_pagecache_reserved(inode, &hole_start,
-						 iter.pos.offset, true))
-			drop_locks_do(trans,
-				bch2_mark_pagecache_reserved(inode, &hole_start,
-							     iter.pos.offset, false));
-bkey_err:
-		bch2_quota_reservation_put(c, inode, &quota_res);
-		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			ret = 0;
-	}
-
-	if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) {
-		struct quota_res quota_res = { 0 };
-		s64 i_sectors_delta = 0;
-
-		bch2_fpunch_at(trans, &iter, inode_inum(inode),
-			       end_sector, &i_sectors_delta);
-		bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
-		bch2_quota_reservation_put(c, inode, &quota_res);
-	}
-
-	bch2_trans_iter_exit(trans, &iter);
-	bch2_trans_put(trans);
-	return ret;
-}
-
-static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
-			    loff_t offset, loff_t len)
-{
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	u64 end		= offset + len;
-	u64 block_start	= round_down(offset,	block_bytes(c));
-	u64 block_end	= round_up(end,		block_bytes(c));
-	bool truncated_last_page = false;
-	int ret, ret2 = 0;
-
-	if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
-		ret = inode_newsize_ok(&inode->v, end);
-		if (ret)
-			return ret;
-	}
-
-	if (mode & FALLOC_FL_ZERO_RANGE) {
-		ret = bch2_truncate_folios(inode, offset, end);
-		if (unlikely(ret < 0))
-			return ret;
-
-		truncated_last_page = ret;
-
-		truncate_pagecache_range(&inode->v, offset, end - 1);
-
-		block_start	= round_up(offset,	block_bytes(c));
-		block_end	= round_down(end,	block_bytes(c));
-	}
-
-	ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9);
-
-	/*
-	 * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update,
-	 * so that the VFS cache i_size is consistent with the btree i_size:
-	 */
-	if (ret &&
-	    !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)))
-		return ret;
-
-	if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size)
-		end = inode->v.i_size;
-
-	if (end >= inode->v.i_size &&
-	    (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) ||
-	     !(mode & FALLOC_FL_KEEP_SIZE))) {
-		spin_lock(&inode->v.i_lock);
-		i_size_write(&inode->v, end);
-		spin_unlock(&inode->v.i_lock);
-
-		mutex_lock(&inode->ei_update_lock);
-		ret2 = bch2_write_inode_size(c, inode, end, 0);
-		mutex_unlock(&inode->ei_update_lock);
-	}
-
-	return ret ?: ret2;
-}
-
-long bch2_fallocate_dispatch(struct file *file, int mode,
-			     loff_t offset, loff_t len)
-{
-	struct bch_inode_info *inode = file_bch_inode(file);
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	long ret;
-
-	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate))
-		return -EROFS;
-
-	inode_lock(&inode->v);
-	inode_dio_wait(&inode->v);
-	bch2_pagecache_block_get(inode);
-
-	ret = file_modified(file);
-	if (ret)
-		goto err;
-
-	if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
-		ret = bchfs_fallocate(inode, mode, offset, len);
-	else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
-		ret = bchfs_fpunch(inode, offset, len);
-	else if (mode == FALLOC_FL_INSERT_RANGE)
-		ret = bchfs_fcollapse_finsert(inode, offset, len, true);
-	else if (mode == FALLOC_FL_COLLAPSE_RANGE)
-		ret = bchfs_fcollapse_finsert(inode, offset, len, false);
-	else
-		ret = -EOPNOTSUPP;
-err:
-	bch2_pagecache_block_put(inode);
-	inode_unlock(&inode->v);
-	bch2_write_ref_put(c, BCH_WRITE_REF_fallocate);
-
-	return bch2_err_class(ret);
-}
-
-/*
- * Take a quota reservation for unallocated blocks in a given file range
- * Does not check pagecache
- */
-static int quota_reserve_range(struct bch_inode_info *inode,
-			       struct quota_res *res,
-			       u64 start, u64 end)
-{
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct btree_trans *trans = bch2_trans_get(c);
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	u32 snapshot;
-	u64 sectors = end - start;
-	u64 pos = start;
-	int ret;
-retry:
-	bch2_trans_begin(trans);
-
-	ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
-	if (ret)
-		goto err;
-
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
-			     SPOS(inode->v.i_ino, pos, snapshot), 0);
-
-	while (!(ret = btree_trans_too_many_iters(trans)) &&
-	       (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k &&
-	       !(ret = bkey_err(k))) {
-		if (bkey_extent_is_allocation(k.k)) {
-			u64 s = min(end, k.k->p.offset) -
-				max(start, bkey_start_offset(k.k));
-			BUG_ON(s > sectors);
-			sectors -= s;
-		}
-		bch2_btree_iter_advance(&iter);
-	}
-	pos = iter.pos.offset;
-	bch2_trans_iter_exit(trans, &iter);
-err:
-	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-		goto retry;
-
-	bch2_trans_put(trans);
-
-	return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true);
-}
-
-loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
-			     struct file *file_dst, loff_t pos_dst,
-			     loff_t len, unsigned remap_flags)
-{
-	struct bch_inode_info *src = file_bch_inode(file_src);
-	struct bch_inode_info *dst = file_bch_inode(file_dst);
-	struct bch_fs *c = src->v.i_sb->s_fs_info;
-	struct quota_res quota_res = { 0 };
-	s64 i_sectors_delta = 0;
-	u64 aligned_len;
-	loff_t ret = 0;
-
-	if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
-		return -EINVAL;
-
-	if (remap_flags & REMAP_FILE_DEDUP)
-		return -EOPNOTSUPP;
-
-	if ((pos_src & (block_bytes(c) - 1)) ||
-	    (pos_dst & (block_bytes(c) - 1)))
-		return -EINVAL;
-
-	if (src == dst &&
-	    abs(pos_src - pos_dst) < len)
-		return -EINVAL;
-
-	lock_two_nondirectories(&src->v, &dst->v);
-	bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
-
-	inode_dio_wait(&src->v);
-	inode_dio_wait(&dst->v);
-
-	ret = generic_remap_file_range_prep(file_src, pos_src,
-					    file_dst, pos_dst,
-					    &len, remap_flags);
-	if (ret < 0 || len == 0)
-		goto err;
-
-	aligned_len = round_up((u64) len, block_bytes(c));
-
-	ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping,
-				pos_dst, pos_dst + len - 1);
-	if (ret)
-		goto err;
-
-	ret = quota_reserve_range(dst, &quota_res, pos_dst >> 9,
-				  (pos_dst + aligned_len) >> 9);
-	if (ret)
-		goto err;
-
-	file_update_time(file_dst);
-
-	bch2_mark_pagecache_unallocated(src, pos_src >> 9,
-				   (pos_src + aligned_len) >> 9);
-
-	ret = bch2_remap_range(c,
-			       inode_inum(dst), pos_dst >> 9,
-			       inode_inum(src), pos_src >> 9,
-			       aligned_len >> 9,
-			       pos_dst + len, &i_sectors_delta);
-	if (ret < 0)
-		goto err;
-
-	/*
-	 * due to alignment, we might have remapped slightly more than requsted
-	 */
-	ret = min((u64) ret << 9, (u64) len);
-
-	bch2_i_sectors_acct(c, dst, &quota_res, i_sectors_delta);
-
-	spin_lock(&dst->v.i_lock);
-	if (pos_dst + ret > dst->v.i_size)
-		i_size_write(&dst->v, pos_dst + ret);
-	spin_unlock(&dst->v.i_lock);
-
-	if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) ||
-	    IS_SYNC(file_inode(file_dst)))
-		ret = bch2_flush_inode(c, dst);
-err:
-	bch2_quota_reservation_put(c, dst, &quota_res);
-	bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
-	unlock_two_nondirectories(&src->v, &dst->v);
-
-	return bch2_err_class(ret);
-}
-
-/* fseek: */
-
-static loff_t bch2_seek_data(struct file *file, u64 offset)
-{
-	struct bch_inode_info *inode = file_bch_inode(file);
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct btree_trans *trans;
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	subvol_inum inum = inode_inum(inode);
-	u64 isize, next_data = MAX_LFS_FILESIZE;
-	u32 snapshot;
-	int ret;
-
-	isize = i_size_read(&inode->v);
-	if (offset >= isize)
-		return -ENXIO;
-
-	trans = bch2_trans_get(c);
-retry:
-	bch2_trans_begin(trans);
-
-	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-	if (ret)
-		goto err;
-
-	for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents,
-			   SPOS(inode->v.i_ino, offset >> 9, snapshot),
-			   POS(inode->v.i_ino, U64_MAX),
-			   0, k, ret) {
-		if (bkey_extent_is_data(k.k)) {
-			next_data = max(offset, bkey_start_offset(k.k) << 9);
-			break;
-		} else if (k.k->p.offset >> 9 > isize)
-			break;
-	}
-	bch2_trans_iter_exit(trans, &iter);
-err:
-	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-		goto retry;
-
-	bch2_trans_put(trans);
-	if (ret)
-		return ret;
-
-	if (next_data > offset)
-		next_data = bch2_seek_pagecache_data(&inode->v,
-					offset, next_data, 0, false);
-
-	if (next_data >= isize)
-		return -ENXIO;
-
-	return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
-}
-
-static loff_t bch2_seek_hole(struct file *file, u64 offset)
-{
-	struct bch_inode_info *inode = file_bch_inode(file);
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct btree_trans *trans;
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	subvol_inum inum = inode_inum(inode);
-	u64 isize, next_hole = MAX_LFS_FILESIZE;
-	u32 snapshot;
-	int ret;
-
-	isize = i_size_read(&inode->v);
-	if (offset >= isize)
-		return -ENXIO;
-
-	trans = bch2_trans_get(c);
-retry:
-	bch2_trans_begin(trans);
-
-	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-	if (ret)
-		goto err;
-
-	for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
-			   SPOS(inode->v.i_ino, offset >> 9, snapshot),
-			   BTREE_ITER_slots, k, ret) {
-		if (k.k->p.inode != inode->v.i_ino) {
-			next_hole = bch2_seek_pagecache_hole(&inode->v,
-					offset, MAX_LFS_FILESIZE, 0, false);
-			break;
-		} else if (!bkey_extent_is_data(k.k)) {
-			next_hole = bch2_seek_pagecache_hole(&inode->v,
-					max(offset, bkey_start_offset(k.k) << 9),
-					k.k->p.offset << 9, 0, false);
-
-			if (next_hole < k.k->p.offset << 9)
-				break;
-		} else {
-			offset = max(offset, bkey_start_offset(k.k) << 9);
-		}
-	}
-	bch2_trans_iter_exit(trans, &iter);
-err:
-	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-		goto retry;
-
-	bch2_trans_put(trans);
-	if (ret)
-		return ret;
-
-	if (next_hole > isize)
-		next_hole = isize;
-
-	return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE);
-}
-
-loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
-{
-	loff_t ret;
-
-	switch (whence) {
-	case SEEK_SET:
-	case SEEK_CUR:
-	case SEEK_END:
-		ret = generic_file_llseek(file, offset, whence);
-		break;
-	case SEEK_DATA:
-		ret = bch2_seek_data(file, offset);
-		break;
-	case SEEK_HOLE:
-		ret = bch2_seek_hole(file, offset);
-		break;
-	default:
-		ret = -EINVAL;
-		break;
-	}
-
-	return bch2_err_class(ret);
-}
-
-void bch2_fs_fsio_exit(struct bch_fs *c)
-{
-	bioset_exit(&c->nocow_flush_bioset);
-}
-
-int bch2_fs_fsio_init(struct bch_fs *c)
-{
-	if (bioset_init(&c->nocow_flush_bioset,
-			1, offsetof(struct nocow_flush, bio), 0))
-		return -BCH_ERR_ENOMEM_nocow_flush_bioset_init;
-
-	return 0;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h
deleted file mode 100644
index ca70346e68dc..000000000000
--- a/fs/bcachefs/fs-io.h
+++ /dev/null
@@ -1,184 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_IO_H
-#define _BCACHEFS_FS_IO_H
-
-#ifndef NO_BCACHEFS_FS
-
-#include "buckets.h"
-#include "fs.h"
-#include "io_write_types.h"
-#include "quota.h"
-
-#include <linux/uio.h>
-
-struct folio_vec {
-	struct folio	*fv_folio;
-	size_t		fv_offset;
-	size_t		fv_len;
-};
-
-static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv)
-{
-
-	struct folio *folio	= page_folio(bv.bv_page);
-	size_t offset		= (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) +
-		bv.bv_offset;
-	size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len);
-
-	return (struct folio_vec) {
-		.fv_folio	= folio,
-		.fv_offset	= offset,
-		.fv_len		= len,
-	};
-}
-
-static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio,
-						    struct bvec_iter iter)
-{
-	return biovec_to_foliovec(bio_iter_iovec(bio, iter));
-}
-
-#define __bio_for_each_folio(bvl, bio, iter, start)			\
-	for (iter = (start);						\
-	     (iter).bi_size &&						\
-		((bvl = bio_iter_iovec_folio((bio), (iter))), 1);	\
-	     bio_advance_iter_single((bio), &(iter), (bvl).fv_len))
-
-/**
- * bio_for_each_folio - iterate over folios within a bio
- *
- * Like other non-_all versions, this iterates over what bio->bi_iter currently
- * points to. This version is for drivers, where the bio may have previously
- * been split or cloned.
- */
-#define bio_for_each_folio(bvl, bio, iter)				\
-	__bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter)
-
-struct quota_res {
-	u64				sectors;
-};
-
-#ifdef CONFIG_BCACHEFS_QUOTA
-
-static inline void __bch2_quota_reservation_put(struct bch_fs *c,
-					 struct bch_inode_info *inode,
-					 struct quota_res *res)
-{
-	BUG_ON(res->sectors > inode->ei_quota_reserved);
-
-	bch2_quota_acct(c, inode->ei_qid, Q_SPC,
-			-((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
-	inode->ei_quota_reserved -= res->sectors;
-	res->sectors = 0;
-}
-
-static inline void bch2_quota_reservation_put(struct bch_fs *c,
-				       struct bch_inode_info *inode,
-				       struct quota_res *res)
-{
-	if (res->sectors) {
-		mutex_lock(&inode->ei_quota_lock);
-		__bch2_quota_reservation_put(c, inode, res);
-		mutex_unlock(&inode->ei_quota_lock);
-	}
-}
-
-static inline int bch2_quota_reservation_add(struct bch_fs *c,
-				      struct bch_inode_info *inode,
-				      struct quota_res *res,
-				      u64 sectors,
-				      bool check_enospc)
-{
-	int ret;
-
-	if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags))
-		return 0;
-
-	mutex_lock(&inode->ei_quota_lock);
-	ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
-			      check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
-	if (likely(!ret)) {
-		inode->ei_quota_reserved += sectors;
-		res->sectors += sectors;
-	}
-	mutex_unlock(&inode->ei_quota_lock);
-
-	return ret;
-}
-
-#else
-
-static inline void __bch2_quota_reservation_put(struct bch_fs *c,
-					 struct bch_inode_info *inode,
-					 struct quota_res *res) {}
-
-static inline void bch2_quota_reservation_put(struct bch_fs *c,
-				       struct bch_inode_info *inode,
-				       struct quota_res *res) {}
-
-static inline int bch2_quota_reservation_add(struct bch_fs *c,
-				      struct bch_inode_info *inode,
-				      struct quota_res *res,
-				      unsigned sectors,
-				      bool check_enospc)
-{
-	return 0;
-}
-
-#endif
-
-void __bch2_i_sectors_acct(struct bch_fs *, struct bch_inode_info *,
-			   struct quota_res *, s64);
-
-static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
-				       struct quota_res *quota_res, s64 sectors)
-{
-	if (sectors) {
-		mutex_lock(&inode->ei_quota_lock);
-		__bch2_i_sectors_acct(c, inode, quota_res, sectors);
-		mutex_unlock(&inode->ei_quota_lock);
-	}
-}
-
-static inline struct address_space *faults_disabled_mapping(void)
-{
-	return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
-}
-
-static inline void set_fdm_dropped_locks(void)
-{
-	current->faults_disabled_mapping =
-		(void *) (((unsigned long) current->faults_disabled_mapping)|1);
-}
-
-static inline bool fdm_dropped_locks(void)
-{
-	return ((unsigned long) current->faults_disabled_mapping) & 1;
-}
-
-void bch2_inode_flush_nocow_writes_async(struct bch_fs *,
-			struct bch_inode_info *, struct closure *);
-
-int __must_check bch2_write_inode_size(struct bch_fs *,
-				       struct bch_inode_info *,
-				       loff_t, unsigned);
-
-int bch2_fsync(struct file *, loff_t, loff_t, int);
-
-int bchfs_truncate(struct mnt_idmap *,
-		  struct bch_inode_info *, struct iattr *);
-long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
-
-loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
-			     loff_t, loff_t, unsigned);
-
-loff_t bch2_llseek(struct file *, loff_t, int);
-
-void bch2_fs_fsio_exit(struct bch_fs *);
-int bch2_fs_fsio_init(struct bch_fs *);
-#else
-static inline void bch2_fs_fsio_exit(struct bch_fs *c) {}
-static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; }
-#endif
-
-#endif /* _BCACHEFS_FS_IO_H */
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
deleted file mode 100644
index 205a323ffc6d..000000000000
--- a/fs/bcachefs/fs-ioctl.c
+++ /dev/null
@@ -1,564 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "chardev.h"
-#include "dirent.h"
-#include "fs.h"
-#include "fs-common.h"
-#include "fs-ioctl.h"
-#include "quota.h"
-
-#include <linux/compat.h>
-#include <linux/fsnotify.h>
-#include <linux/mount.h>
-#include <linux/namei.h>
-#include <linux/security.h>
-#include <linux/writeback.h>
-
-#define FS_IOC_GOINGDOWN	     _IOR('X', 125, __u32)
-#define FSOP_GOING_FLAGS_DEFAULT	0x0	/* going down */
-#define FSOP_GOING_FLAGS_LOGFLUSH	0x1	/* flush log but not data */
-#define FSOP_GOING_FLAGS_NOLOGFLUSH	0x2	/* don't flush log nor data */
-
-struct flags_set {
-	unsigned		mask;
-	unsigned		flags;
-
-	unsigned		projid;
-
-	bool			set_projinherit;
-	bool			projinherit;
-};
-
-static int bch2_inode_flags_set(struct btree_trans *trans,
-				struct bch_inode_info *inode,
-				struct bch_inode_unpacked *bi,
-				void *p)
-{
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	/*
-	 * We're relying on btree locking here for exclusion with other ioctl
-	 * calls - use the flags in the btree (@bi), not inode->i_flags:
-	 */
-	struct flags_set *s = p;
-	unsigned newflags = s->flags;
-	unsigned oldflags = bi->bi_flags & s->mask;
-
-	if (((newflags ^ oldflags) & (BCH_INODE_append|BCH_INODE_immutable)) &&
-	    !capable(CAP_LINUX_IMMUTABLE))
-		return -EPERM;
-
-	if (!S_ISREG(bi->bi_mode) &&
-	    !S_ISDIR(bi->bi_mode) &&
-	    (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags)
-		return -EINVAL;
-
-	if (s->set_projinherit) {
-		bi->bi_fields_set &= ~(1 << Inode_opt_project);
-		bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project);
-	}
-
-	bi->bi_flags &= ~s->mask;
-	bi->bi_flags |= newflags;
-
-	bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v));
-	return 0;
-}
-
-static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg)
-{
-	unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags);
-
-	return put_user(flags, arg);
-}
-
-static int bch2_ioc_setflags(struct bch_fs *c,
-			     struct file *file,
-			     struct bch_inode_info *inode,
-			     void __user *arg)
-{
-	struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) };
-	unsigned uflags;
-	int ret;
-
-	if (get_user(uflags, (int __user *) arg))
-		return -EFAULT;
-
-	s.flags = map_flags_rev(bch_flags_to_uflags, uflags);
-	if (uflags)
-		return -EOPNOTSUPP;
-
-	ret = mnt_want_write_file(file);
-	if (ret)
-		return ret;
-
-	inode_lock(&inode->v);
-	if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
-		ret = -EACCES;
-		goto setflags_out;
-	}
-
-	mutex_lock(&inode->ei_update_lock);
-	ret   = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
-		bch2_write_inode(c, inode, bch2_inode_flags_set, &s,
-			       ATTR_CTIME);
-	mutex_unlock(&inode->ei_update_lock);
-
-setflags_out:
-	inode_unlock(&inode->v);
-	mnt_drop_write_file(file);
-	return ret;
-}
-
-static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
-			       struct fsxattr __user *arg)
-{
-	struct fsxattr fa = { 0 };
-
-	fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
-
-	if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project))
-		fa.fsx_xflags |= FS_XFLAG_PROJINHERIT;
-
-	fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];
-
-	if (copy_to_user(arg, &fa, sizeof(fa)))
-		return -EFAULT;
-
-	return 0;
-}
-
-static int fssetxattr_inode_update_fn(struct btree_trans *trans,
-				      struct bch_inode_info *inode,
-				      struct bch_inode_unpacked *bi,
-				      void *p)
-{
-	struct flags_set *s = p;
-
-	if (s->projid != bi->bi_project) {
-		bi->bi_fields_set |= 1U << Inode_opt_project;
-		bi->bi_project = s->projid;
-	}
-
-	return bch2_inode_flags_set(trans, inode, bi, p);
-}
-
-static int bch2_ioc_fssetxattr(struct bch_fs *c,
-			       struct file *file,
-			       struct bch_inode_info *inode,
-			       struct fsxattr __user *arg)
-{
-	struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) };
-	struct fsxattr fa;
-	int ret;
-
-	if (copy_from_user(&fa, arg, sizeof(fa)))
-		return -EFAULT;
-
-	s.set_projinherit = true;
-	s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0;
-	fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT;
-
-	s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags);
-	if (fa.fsx_xflags)
-		return -EOPNOTSUPP;
-
-	if (fa.fsx_projid >= U32_MAX)
-		return -EINVAL;
-
-	/*
-	 * inode fields accessible via the xattr interface are stored with a +1
-	 * bias, so that 0 means unset:
-	 */
-	s.projid = fa.fsx_projid + 1;
-
-	ret = mnt_want_write_file(file);
-	if (ret)
-		return ret;
-
-	inode_lock(&inode->v);
-	if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
-		ret = -EACCES;
-		goto err;
-	}
-
-	mutex_lock(&inode->ei_update_lock);
-	ret   = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
-		bch2_set_projid(c, inode, fa.fsx_projid) ?:
-		bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
-			       ATTR_CTIME);
-	mutex_unlock(&inode->ei_update_lock);
-err:
-	inode_unlock(&inode->v);
-	mnt_drop_write_file(file);
-	return ret;
-}
-
-static int bch2_reinherit_attrs_fn(struct btree_trans *trans,
-				   struct bch_inode_info *inode,
-				   struct bch_inode_unpacked *bi,
-				   void *p)
-{
-	struct bch_inode_info *dir = p;
-
-	return !bch2_reinherit_attrs(bi, &dir->ei_inode);
-}
-
-static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
-				    struct file *file,
-				    struct bch_inode_info *src,
-				    const char __user *name)
-{
-	struct bch_hash_info hash = bch2_hash_info_init(c, &src->ei_inode);
-	struct bch_inode_info *dst;
-	struct inode *vinode = NULL;
-	char *kname = NULL;
-	struct qstr qstr;
-	int ret = 0;
-	subvol_inum inum;
-
-	kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL);
-	if (!kname)
-		return -ENOMEM;
-
-	ret = strncpy_from_user(kname, name, BCH_NAME_MAX);
-	if (unlikely(ret < 0))
-		goto err1;
-
-	qstr.len	= ret;
-	qstr.name	= kname;
-
-	ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum);
-	if (ret)
-		goto err1;
-
-	vinode = bch2_vfs_inode_get(c, inum);
-	ret = PTR_ERR_OR_ZERO(vinode);
-	if (ret)
-		goto err1;
-
-	dst = to_bch_ei(vinode);
-
-	ret = mnt_want_write_file(file);
-	if (ret)
-		goto err2;
-
-	bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst);
-
-	if (inode_attr_changing(src, dst, Inode_opt_project)) {
-		ret = bch2_fs_quota_transfer(c, dst,
-					     src->ei_qid,
-					     1 << QTYP_PRJ,
-					     KEY_TYPE_QUOTA_PREALLOC);
-		if (ret)
-			goto err3;
-	}
-
-	ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0);
-err3:
-	bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst);
-
-	/* return true if we did work */
-	if (ret >= 0)
-		ret = !ret;
-
-	mnt_drop_write_file(file);
-err2:
-	iput(vinode);
-err1:
-	kfree(kname);
-
-	return ret;
-}
-
-static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
-{
-	u32 flags;
-	int ret = 0;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	if (get_user(flags, arg))
-		return -EFAULT;
-
-	bch_notice(c, "shutdown by ioctl type %u", flags);
-
-	switch (flags) {
-	case FSOP_GOING_FLAGS_DEFAULT:
-		ret = bdev_freeze(c->vfs_sb->s_bdev);
-		if (ret)
-			break;
-		bch2_journal_flush(&c->journal);
-		bch2_fs_emergency_read_only(c);
-		bdev_thaw(c->vfs_sb->s_bdev);
-		break;
-	case FSOP_GOING_FLAGS_LOGFLUSH:
-		bch2_journal_flush(&c->journal);
-		fallthrough;
-	case FSOP_GOING_FLAGS_NOLOGFLUSH:
-		bch2_fs_emergency_read_only(c);
-		break;
-	default:
-		ret = -EINVAL;
-		break;
-	}
-
-	return ret;
-}
-
-static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
-					  struct bch_ioctl_subvolume arg)
-{
-	struct inode *dir;
-	struct bch_inode_info *inode;
-	struct user_namespace *s_user_ns;
-	struct dentry *dst_dentry;
-	struct path src_path, dst_path;
-	int how = LOOKUP_FOLLOW;
-	int error;
-	subvol_inum snapshot_src = { 0 };
-	unsigned lookup_flags = 0;
-	unsigned create_flags = BCH_CREATE_SUBVOL;
-
-	if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE|
-			  BCH_SUBVOL_SNAPSHOT_RO))
-		return -EINVAL;
-
-	if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
-	    (arg.src_ptr ||
-	     (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)))
-		return -EINVAL;
-
-	if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
-		create_flags |= BCH_CREATE_SNAPSHOT;
-
-	if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)
-		create_flags |= BCH_CREATE_SNAPSHOT_RO;
-
-	if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) {
-		/* sync_inodes_sb enforce s_umount is locked */
-		down_read(&c->vfs_sb->s_umount);
-		sync_inodes_sb(c->vfs_sb);
-		up_read(&c->vfs_sb->s_umount);
-	}
-retry:
-	if (arg.src_ptr) {
-		error = user_path_at(arg.dirfd,
-				(const char __user *)(unsigned long)arg.src_ptr,
-				how, &src_path);
-		if (error)
-			goto err1;
-
-		if (src_path.dentry->d_sb->s_fs_info != c) {
-			path_put(&src_path);
-			error = -EXDEV;
-			goto err1;
-		}
-
-		snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode));
-	}
-
-	dst_dentry = user_path_create(arg.dirfd,
-			(const char __user *)(unsigned long)arg.dst_ptr,
-			&dst_path, lookup_flags);
-	error = PTR_ERR_OR_ZERO(dst_dentry);
-	if (error)
-		goto err2;
-
-	if (dst_dentry->d_sb->s_fs_info != c) {
-		error = -EXDEV;
-		goto err3;
-	}
-
-	if (dst_dentry->d_inode) {
-		error = -EEXIST;
-		goto err3;
-	}
-
-	dir = dst_path.dentry->d_inode;
-	if (IS_DEADDIR(dir)) {
-		error = -BCH_ERR_ENOENT_directory_dead;
-		goto err3;
-	}
-
-	s_user_ns = dir->i_sb->s_user_ns;
-	if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
-	    !kgid_has_mapping(s_user_ns, current_fsgid())) {
-		error = -EOVERFLOW;
-		goto err3;
-	}
-
-	error = inode_permission(file_mnt_idmap(filp),
-				 dir, MAY_WRITE | MAY_EXEC);
-	if (error)
-		goto err3;
-
-	if (!IS_POSIXACL(dir))
-		arg.mode &= ~current_umask();
-
-	error = security_path_mkdir(&dst_path, dst_dentry, arg.mode);
-	if (error)
-		goto err3;
-
-	if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
-	    !arg.src_ptr)
-		snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol;
-
-	inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir),
-			      dst_dentry, arg.mode|S_IFDIR,
-			      0, snapshot_src, create_flags);
-	error = PTR_ERR_OR_ZERO(inode);
-	if (error)
-		goto err3;
-
-	d_instantiate(dst_dentry, &inode->v);
-	fsnotify_mkdir(dir, dst_dentry);
-err3:
-	done_path_create(&dst_path, dst_dentry);
-err2:
-	if (arg.src_ptr)
-		path_put(&src_path);
-
-	if (retry_estale(error, lookup_flags)) {
-		lookup_flags |= LOOKUP_REVAL;
-		goto retry;
-	}
-err1:
-	return error;
-}
-
-static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
-					struct bch_ioctl_subvolume arg)
-{
-	down_write(&c->snapshot_create_lock);
-	long ret = __bch2_ioctl_subvolume_create(c, filp, arg);
-	up_write(&c->snapshot_create_lock);
-
-	return ret;
-}
-
-static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
-				struct bch_ioctl_subvolume arg)
-{
-	const char __user *name = (void __user *)(unsigned long)arg.dst_ptr;
-	struct path path;
-	struct inode *dir;
-	struct dentry *victim;
-	int ret = 0;
-
-	if (arg.flags)
-		return -EINVAL;
-
-	victim = user_path_locked_at(arg.dirfd, name, &path);
-	if (IS_ERR(victim))
-		return PTR_ERR(victim);
-
-	dir = d_inode(path.dentry);
-	if (victim->d_sb->s_fs_info != c) {
-		ret = -EXDEV;
-		goto err;
-	}
-	if (!d_is_positive(victim)) {
-		ret = -ENOENT;
-		goto err;
-	}
-	ret = __bch2_unlink(dir, victim, true);
-	if (!ret) {
-		fsnotify_rmdir(dir, victim);
-		d_delete(victim);
-	}
-err:
-	inode_unlock(dir);
-	dput(victim);
-	path_put(&path);
-	return ret;
-}
-
-long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
-{
-	struct bch_inode_info *inode = file_bch_inode(file);
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	long ret;
-
-	switch (cmd) {
-	case FS_IOC_GETFLAGS:
-		ret = bch2_ioc_getflags(inode, (int __user *) arg);
-		break;
-
-	case FS_IOC_SETFLAGS:
-		ret = bch2_ioc_setflags(c, file, inode, (int __user *) arg);
-		break;
-
-	case FS_IOC_FSGETXATTR:
-		ret = bch2_ioc_fsgetxattr(inode, (void __user *) arg);
-		break;
-
-	case FS_IOC_FSSETXATTR:
-		ret = bch2_ioc_fssetxattr(c, file, inode,
-					  (void __user *) arg);
-		break;
-
-	case BCHFS_IOC_REINHERIT_ATTRS:
-		ret = bch2_ioc_reinherit_attrs(c, file, inode,
-					       (void __user *) arg);
-		break;
-
-	case FS_IOC_GETVERSION:
-		ret = -ENOTTY;
-		break;
-
-	case FS_IOC_SETVERSION:
-		ret = -ENOTTY;
-		break;
-
-	case FS_IOC_GOINGDOWN:
-		ret = bch2_ioc_goingdown(c, (u32 __user *) arg);
-		break;
-
-	case BCH_IOCTL_SUBVOLUME_CREATE: {
-		struct bch_ioctl_subvolume i;
-
-		ret = copy_from_user(&i, (void __user *) arg, sizeof(i))
-			? -EFAULT
-			: bch2_ioctl_subvolume_create(c, file, i);
-		break;
-	}
-
-	case BCH_IOCTL_SUBVOLUME_DESTROY: {
-		struct bch_ioctl_subvolume i;
-
-		ret = copy_from_user(&i, (void __user *) arg, sizeof(i))
-			? -EFAULT
-			: bch2_ioctl_subvolume_destroy(c, file, i);
-		break;
-	}
-
-	default:
-		ret = bch2_fs_ioctl(c, cmd, (void __user *) arg);
-		break;
-	}
-
-	return bch2_err_class(ret);
-}
-
-#ifdef CONFIG_COMPAT
-long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg)
-{
-	/* These are just misnamed, they actually get/put from/to user an int */
-	switch (cmd) {
-	case FS_IOC32_GETFLAGS:
-		cmd = FS_IOC_GETFLAGS;
-		break;
-	case FS_IOC32_SETFLAGS:
-		cmd = FS_IOC_SETFLAGS;
-		break;
-	default:
-		return -ENOIOCTLCMD;
-	}
-	return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
-}
-#endif
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h
deleted file mode 100644
index d30f9bb056fd..000000000000
--- a/fs/bcachefs/fs-ioctl.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_IOCTL_H
-#define _BCACHEFS_FS_IOCTL_H
-
-/* Inode flags: */
-
-/* bcachefs inode flags -> vfs inode flags: */
-static const __maybe_unused unsigned bch_flags_to_vfs[] = {
-	[__BCH_INODE_sync]	= S_SYNC,
-	[__BCH_INODE_immutable]	= S_IMMUTABLE,
-	[__BCH_INODE_append]	= S_APPEND,
-	[__BCH_INODE_noatime]	= S_NOATIME,
-};
-
-/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
-static const __maybe_unused unsigned bch_flags_to_uflags[] = {
-	[__BCH_INODE_sync]	= FS_SYNC_FL,
-	[__BCH_INODE_immutable]	= FS_IMMUTABLE_FL,
-	[__BCH_INODE_append]	= FS_APPEND_FL,
-	[__BCH_INODE_nodump]	= FS_NODUMP_FL,
-	[__BCH_INODE_noatime]	= FS_NOATIME_FL,
-};
-
-/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
-static const __maybe_unused unsigned bch_flags_to_xflags[] = {
-	[__BCH_INODE_sync]	= FS_XFLAG_SYNC,
-	[__BCH_INODE_immutable]	= FS_XFLAG_IMMUTABLE,
-	[__BCH_INODE_append]	= FS_XFLAG_APPEND,
-	[__BCH_INODE_nodump]	= FS_XFLAG_NODUMP,
-	[__BCH_INODE_noatime]	= FS_XFLAG_NOATIME,
-	//[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT;
-};
-
-#define set_flags(_map, _in, _out)					\
-do {									\
-	unsigned _i;							\
-									\
-	for (_i = 0; _i < ARRAY_SIZE(_map); _i++)			\
-		if ((_in) & (1 << _i))					\
-			(_out) |= _map[_i];				\
-		else							\
-			(_out) &= ~_map[_i];				\
-} while (0)
-
-#define map_flags(_map, _in)						\
-({									\
-	unsigned _out = 0;						\
-									\
-	set_flags(_map, _in, _out);					\
-	_out;								\
-})
-
-#define map_flags_rev(_map, _in)					\
-({									\
-	unsigned _i, _out = 0;						\
-									\
-	for (_i = 0; _i < ARRAY_SIZE(_map); _i++)			\
-		if ((_in) & _map[_i]) {					\
-			(_out) |= 1 << _i;				\
-			(_in) &= ~_map[_i];				\
-		}							\
-	(_out);								\
-})
-
-#define map_defined(_map)						\
-({									\
-	unsigned _in = ~0;						\
-									\
-	map_flags_rev(_map, _in);					\
-})
-
-/* Set VFS inode flags from bcachefs inode: */
-static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
-{
-	set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
-}
-
-long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long);
-long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long);
-
-#endif /* _BCACHEFS_FS_IOCTL_H */
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
deleted file mode 100644
index fd851f10d11c..000000000000
--- a/fs/bcachefs/fs.c
+++ /dev/null
@@ -1,2064 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "acl.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "chardev.h"
-#include "dirent.h"
-#include "errcode.h"
-#include "extents.h"
-#include "fs.h"
-#include "fs-common.h"
-#include "fs-io.h"
-#include "fs-ioctl.h"
-#include "fs-io-buffered.h"
-#include "fs-io-direct.h"
-#include "fs-io-pagecache.h"
-#include "fsck.h"
-#include "inode.h"
-#include "io_read.h"
-#include "journal.h"
-#include "keylist.h"
-#include "quota.h"
-#include "snapshot.h"
-#include "super.h"
-#include "xattr.h"
-
-#include <linux/aio.h>
-#include <linux/backing-dev.h>
-#include <linux/exportfs.h>
-#include <linux/fiemap.h>
-#include <linux/module.h>
-#include <linux/pagemap.h>
-#include <linux/posix_acl.h>
-#include <linux/random.h>
-#include <linux/seq_file.h>
-#include <linux/statfs.h>
-#include <linux/string.h>
-#include <linux/xattr.h>
-
-static struct kmem_cache *bch2_inode_cache;
-
-static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
-				struct bch_inode_info *,
-				struct bch_inode_unpacked *,
-				struct bch_subvolume *);
-
-void bch2_inode_update_after_write(struct btree_trans *trans,
-				   struct bch_inode_info *inode,
-				   struct bch_inode_unpacked *bi,
-				   unsigned fields)
-{
-	struct bch_fs *c = trans->c;
-
-	BUG_ON(bi->bi_inum != inode->v.i_ino);
-
-	bch2_assert_pos_locked(trans, BTREE_ID_inodes,
-			       POS(0, bi->bi_inum),
-			       c->opts.inodes_use_key_cache);
-
-	set_nlink(&inode->v, bch2_inode_nlink_get(bi));
-	i_uid_write(&inode->v, bi->bi_uid);
-	i_gid_write(&inode->v, bi->bi_gid);
-	inode->v.i_mode	= bi->bi_mode;
-
-	if (fields & ATTR_ATIME)
-		inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
-	if (fields & ATTR_MTIME)
-		inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime));
-	if (fields & ATTR_CTIME)
-		inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));
-
-	inode->ei_inode		= *bi;
-
-	bch2_inode_flags_to_vfs(inode);
-}
-
-int __must_check bch2_write_inode(struct bch_fs *c,
-				  struct bch_inode_info *inode,
-				  inode_set_fn set,
-				  void *p, unsigned fields)
-{
-	struct btree_trans *trans = bch2_trans_get(c);
-	struct btree_iter iter = { NULL };
-	struct bch_inode_unpacked inode_u;
-	int ret;
-retry:
-	bch2_trans_begin(trans);
-
-	ret   = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode),
-				BTREE_ITER_intent) ?:
-		(set ? set(trans, inode, &inode_u, p) : 0) ?:
-		bch2_inode_write(trans, &iter, &inode_u) ?:
-		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-
-	/*
-	 * the btree node lock protects inode->ei_inode, not ei_update_lock;
-	 * this is important for inode updates via bchfs_write_index_update
-	 */
-	if (!ret)
-		bch2_inode_update_after_write(trans, inode, &inode_u, fields);
-
-	bch2_trans_iter_exit(trans, &iter);
-
-	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-		goto retry;
-
-	bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
-			     "%s: inode %u:%llu not found when updating",
-			     bch2_err_str(ret),
-			     inode_inum(inode).subvol,
-			     inode_inum(inode).inum);
-
-	bch2_trans_put(trans);
-	return ret < 0 ? ret : 0;
-}
-
-int bch2_fs_quota_transfer(struct bch_fs *c,
-			   struct bch_inode_info *inode,
-			   struct bch_qid new_qid,
-			   unsigned qtypes,
-			   enum quota_acct_mode mode)
-{
-	unsigned i;
-	int ret;
-
-	qtypes &= enabled_qtypes(c);
-
-	for (i = 0; i < QTYP_NR; i++)
-		if (new_qid.q[i] == inode->ei_qid.q[i])
-			qtypes &= ~(1U << i);
-
-	if (!qtypes)
-		return 0;
-
-	mutex_lock(&inode->ei_quota_lock);
-
-	ret = bch2_quota_transfer(c, qtypes, new_qid,
-				  inode->ei_qid,
-				  inode->v.i_blocks +
-				  inode->ei_quota_reserved,
-				  mode);
-	if (!ret)
-		for (i = 0; i < QTYP_NR; i++)
-			if (qtypes & (1 << i))
-				inode->ei_qid.q[i] = new_qid.q[i];
-
-	mutex_unlock(&inode->ei_quota_lock);
-
-	return ret;
-}
-
-static int bch2_iget5_test(struct inode *vinode, void *p)
-{
-	struct bch_inode_info *inode = to_bch_ei(vinode);
-	subvol_inum *inum = p;
-
-	return inode->ei_subvol == inum->subvol &&
-		inode->ei_inode.bi_inum == inum->inum;
-}
-
-static int bch2_iget5_set(struct inode *vinode, void *p)
-{
-	struct bch_inode_info *inode = to_bch_ei(vinode);
-	subvol_inum *inum = p;
-
-	inode->v.i_ino		= inum->inum;
-	inode->ei_subvol	= inum->subvol;
-	inode->ei_inode.bi_inum	= inum->inum;
-	return 0;
-}
-
-static unsigned bch2_inode_hash(subvol_inum inum)
-{
-	return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
-}
-
-static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode)
-{
-	subvol_inum inum = inode_inum(inode);
-	struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v,
-				      bch2_inode_hash(inum),
-				      bch2_iget5_test,
-				      bch2_iget5_set,
-				      &inum));
-	BUG_ON(!old);
-
-	if (unlikely(old != inode)) {
-		__destroy_inode(&inode->v);
-		kmem_cache_free(bch2_inode_cache, inode);
-		inode = old;
-	} else {
-		mutex_lock(&c->vfs_inodes_lock);
-		list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
-		mutex_unlock(&c->vfs_inodes_lock);
-		/*
-		 * we really don't want insert_inode_locked2() to be setting
-		 * I_NEW...
-		 */
-		unlock_new_inode(&inode->v);
-	}
-
-	return inode;
-}
-
-#define memalloc_flags_do(_flags, _do)						\
-({										\
-	unsigned _saved_flags = memalloc_flags_save(_flags);			\
-	typeof(_do) _ret = _do;							\
-	memalloc_noreclaim_restore(_saved_flags);				\
-	_ret;									\
-})
-
-static struct inode *bch2_alloc_inode(struct super_block *sb)
-{
-	BUG();
-}
-
-static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
-{
-	struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
-	if (!inode)
-		return NULL;
-
-	inode_init_once(&inode->v);
-	mutex_init(&inode->ei_update_lock);
-	two_state_lock_init(&inode->ei_pagecache_lock);
-	INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
-	mutex_init(&inode->ei_quota_lock);
-	inode->v.i_state = 0;
-
-	if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) {
-		kmem_cache_free(bch2_inode_cache, inode);
-		return NULL;
-	}
-
-	return inode;
-}
-
-/*
- * Allocate a new inode, dropping/retaking btree locks if necessary:
- */
-static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
-{
-	struct bch_inode_info *inode =
-		memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN,
-				  __bch2_new_inode(trans->c));
-
-	if (unlikely(!inode)) {
-		int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c)) ? 0 : -ENOMEM);
-		if (ret && inode) {
-			__destroy_inode(&inode->v);
-			kmem_cache_free(bch2_inode_cache, inode);
-		}
-		if (ret)
-			return ERR_PTR(ret);
-	}
-
-	return inode;
-}
-
-struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
-{
-	struct bch_inode_info *inode =
-		to_bch_ei(ilookup5_nowait(c->vfs_sb,
-					  bch2_inode_hash(inum),
-					  bch2_iget5_test,
-					  &inum));
-	if (inode)
-		return &inode->v;
-
-	struct btree_trans *trans = bch2_trans_get(c);
-
-	struct bch_inode_unpacked inode_u;
-	struct bch_subvolume subvol;
-	int ret = lockrestart_do(trans,
-		bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
-		bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
-		PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
-	if (!ret) {
-		bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
-		inode = bch2_inode_insert(c, inode);
-	}
-	bch2_trans_put(trans);
-
-	return ret ? ERR_PTR(ret) : &inode->v;
-}
-
-struct bch_inode_info *
-__bch2_create(struct mnt_idmap *idmap,
-	      struct bch_inode_info *dir, struct dentry *dentry,
-	      umode_t mode, dev_t rdev, subvol_inum snapshot_src,
-	      unsigned flags)
-{
-	struct bch_fs *c = dir->v.i_sb->s_fs_info;
-	struct btree_trans *trans;
-	struct bch_inode_unpacked dir_u;
-	struct bch_inode_info *inode;
-	struct bch_inode_unpacked inode_u;
-	struct posix_acl *default_acl = NULL, *acl = NULL;
-	subvol_inum inum;
-	struct bch_subvolume subvol;
-	u64 journal_seq = 0;
-	int ret;
-
-	/*
-	 * preallocate acls + vfs inode before btree transaction, so that
-	 * nothing can fail after the transaction succeeds:
-	 */
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-	ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
-	if (ret)
-		return ERR_PTR(ret);
-#endif
-	inode = __bch2_new_inode(c);
-	if (unlikely(!inode)) {
-		inode = ERR_PTR(-ENOMEM);
-		goto err;
-	}
-
-	bch2_inode_init_early(c, &inode_u);
-
-	if (!(flags & BCH_CREATE_TMPFILE))
-		mutex_lock(&dir->ei_update_lock);
-
-	trans = bch2_trans_get(c);
-retry:
-	bch2_trans_begin(trans);
-
-	ret   = bch2_subvol_is_ro_trans(trans, dir->ei_subvol) ?:
-		bch2_create_trans(trans,
-				  inode_inum(dir), &dir_u, &inode_u,
-				  !(flags & BCH_CREATE_TMPFILE)
-				  ? &dentry->d_name : NULL,
-				  from_kuid(i_user_ns(&dir->v), current_fsuid()),
-				  from_kgid(i_user_ns(&dir->v), current_fsgid()),
-				  mode, rdev,
-				  default_acl, acl, snapshot_src, flags) ?:
-		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
-				KEY_TYPE_QUOTA_PREALLOC);
-	if (unlikely(ret))
-		goto err_before_quota;
-
-	inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
-	inum.inum = inode_u.bi_inum;
-
-	ret   = bch2_subvolume_get(trans, inum.subvol, true,
-				   BTREE_ITER_with_updates, &subvol) ?:
-		bch2_trans_commit(trans, NULL, &journal_seq, 0);
-	if (unlikely(ret)) {
-		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
-				KEY_TYPE_QUOTA_WARN);
-err_before_quota:
-		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			goto retry;
-		goto err_trans;
-	}
-
-	if (!(flags & BCH_CREATE_TMPFILE)) {
-		bch2_inode_update_after_write(trans, dir, &dir_u,
-					      ATTR_MTIME|ATTR_CTIME);
-		mutex_unlock(&dir->ei_update_lock);
-	}
-
-	bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
-
-	set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
-	set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
-
-	/*
-	 * we must insert the new inode into the inode cache before calling
-	 * bch2_trans_exit() and dropping locks, else we could race with another
-	 * thread pulling the inode in and modifying it:
-	 */
-	inode = bch2_inode_insert(c, inode);
-	bch2_trans_put(trans);
-err:
-	posix_acl_release(default_acl);
-	posix_acl_release(acl);
-	return inode;
-err_trans:
-	if (!(flags & BCH_CREATE_TMPFILE))
-		mutex_unlock(&dir->ei_update_lock);
-
-	bch2_trans_put(trans);
-	make_bad_inode(&inode->v);
-	iput(&inode->v);
-	inode = ERR_PTR(ret);
-	goto err;
-}
-
-/* methods */
-
-static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
-			subvol_inum dir, struct bch_hash_info *dir_hash_info,
-			const struct qstr *name)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter dirent_iter = {};
-	subvol_inum inum = {};
-	struct printbuf buf = PRINTBUF;
-
-	struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
-					     dir_hash_info, dir, name, 0);
-	int ret = bkey_err(k);
-	if (ret)
-		return ERR_PTR(ret);
-
-	ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
-	if (ret > 0)
-		ret = -ENOENT;
-	if (ret)
-		goto err;
-
-	struct bch_inode_info *inode =
-		to_bch_ei(ilookup5_nowait(c->vfs_sb,
-					  bch2_inode_hash(inum),
-					  bch2_iget5_test,
-					  &inum));
-	if (inode)
-		goto out;
-
-	struct bch_subvolume subvol;
-	struct bch_inode_unpacked inode_u;
-	ret =   bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
-		bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
-		PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
-
-	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
-				c, "dirent to missing inode:\n  %s",
-				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
-	if (ret)
-		goto err;
-
-	/* regular files may have hardlinks: */
-	if (bch2_fs_inconsistent_on(bch2_inode_should_have_bp(&inode_u) &&
-				    !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)),
-				    c,
-				    "dirent points to inode that does not point back:\n  %s",
-				    (bch2_bkey_val_to_text(&buf, c, k),
-				     prt_printf(&buf, "\n  "),
-				     bch2_inode_unpacked_to_text(&buf, &inode_u),
-				     buf.buf))) {
-		ret = -ENOENT;
-		goto err;
-	}
-
-	bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
-	inode = bch2_inode_insert(c, inode);
-out:
-	bch2_trans_iter_exit(trans, &dirent_iter);
-	printbuf_exit(&buf);
-	return inode;
-err:
-	inode = ERR_PTR(ret);
-	goto out;
-}
-
-static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
-				  unsigned int flags)
-{
-	struct bch_fs *c = vdir->i_sb->s_fs_info;
-	struct bch_inode_info *dir = to_bch_ei(vdir);
-	struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
-
-	struct bch_inode_info *inode;
-	bch2_trans_do(c, NULL, NULL, 0,
-		PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
-							  &hash, &dentry->d_name)));
-	if (IS_ERR(inode))
-		inode = NULL;
-
-	return d_splice_alias(&inode->v, dentry);
-}
-
-static int bch2_mknod(struct mnt_idmap *idmap,
-		      struct inode *vdir, struct dentry *dentry,
-		      umode_t mode, dev_t rdev)
-{
-	struct bch_inode_info *inode =
-		__bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev,
-			      (subvol_inum) { 0 }, 0);
-
-	if (IS_ERR(inode))
-		return bch2_err_class(PTR_ERR(inode));
-
-	d_instantiate(dentry, &inode->v);
-	return 0;
-}
-
-static int bch2_create(struct mnt_idmap *idmap,
-		       struct inode *vdir, struct dentry *dentry,
-		       umode_t mode, bool excl)
-{
-	return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0);
-}
-
-static int __bch2_link(struct bch_fs *c,
-		       struct bch_inode_info *inode,
-		       struct bch_inode_info *dir,
-		       struct dentry *dentry)
-{
-	struct btree_trans *trans = bch2_trans_get(c);
-	struct bch_inode_unpacked dir_u, inode_u;
-	int ret;
-
-	mutex_lock(&inode->ei_update_lock);
-
-	ret = commit_do(trans, NULL, NULL, 0,
-			bch2_link_trans(trans,
-					inode_inum(dir),   &dir_u,
-					inode_inum(inode), &inode_u,
-					&dentry->d_name));
-
-	if (likely(!ret)) {
-		bch2_inode_update_after_write(trans, dir, &dir_u,
-					      ATTR_MTIME|ATTR_CTIME);
-		bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
-	}
-
-	bch2_trans_put(trans);
-	mutex_unlock(&inode->ei_update_lock);
-	return ret;
-}
-
-static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
-		     struct dentry *dentry)
-{
-	struct bch_fs *c = vdir->i_sb->s_fs_info;
-	struct bch_inode_info *dir = to_bch_ei(vdir);
-	struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
-	int ret;
-
-	lockdep_assert_held(&inode->v.i_rwsem);
-
-	ret   = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
-		bch2_subvol_is_ro(c, inode->ei_subvol) ?:
-		__bch2_link(c, inode, dir, dentry);
-	if (unlikely(ret))
-		return bch2_err_class(ret);
-
-	ihold(&inode->v);
-	d_instantiate(dentry, &inode->v);
-	return 0;
-}
-
-int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
-		  bool deleting_snapshot)
-{
-	struct bch_fs *c = vdir->i_sb->s_fs_info;
-	struct bch_inode_info *dir = to_bch_ei(vdir);
-	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
-	struct bch_inode_unpacked dir_u, inode_u;
-	struct btree_trans *trans = bch2_trans_get(c);
-	int ret;
-
-	bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
-
-	ret = commit_do(trans, NULL, NULL,
-			BCH_TRANS_COMMIT_no_enospc,
-		bch2_unlink_trans(trans,
-				  inode_inum(dir), &dir_u,
-				  &inode_u, &dentry->d_name,
-				  deleting_snapshot));
-	if (unlikely(ret))
-		goto err;
-
-	bch2_inode_update_after_write(trans, dir, &dir_u,
-				      ATTR_MTIME|ATTR_CTIME);
-	bch2_inode_update_after_write(trans, inode, &inode_u,
-				      ATTR_MTIME);
-
-	if (inode_u.bi_subvol) {
-		/*
-		 * Subvolume deletion is asynchronous, but we still want to tell
-		 * the VFS that it's been deleted here:
-		 */
-		set_nlink(&inode->v, 0);
-	}
-err:
-	bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
-	bch2_trans_put(trans);
-
-	return ret;
-}
-
-static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
-{
-	struct bch_inode_info *dir= to_bch_ei(vdir);
-	struct bch_fs *c = dir->v.i_sb->s_fs_info;
-
-	int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
-		__bch2_unlink(vdir, dentry, false);
-	return bch2_err_class(ret);
-}
-
-static int bch2_symlink(struct mnt_idmap *idmap,
-			struct inode *vdir, struct dentry *dentry,
-			const char *symname)
-{
-	struct bch_fs *c = vdir->i_sb->s_fs_info;
-	struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
-	int ret;
-
-	inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
-			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
-	if (IS_ERR(inode))
-		return bch2_err_class(PTR_ERR(inode));
-
-	inode_lock(&inode->v);
-	ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
-	inode_unlock(&inode->v);
-
-	if (unlikely(ret))
-		goto err;
-
-	ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
-	if (unlikely(ret))
-		goto err;
-
-	ret = __bch2_link(c, inode, dir, dentry);
-	if (unlikely(ret))
-		goto err;
-
-	d_instantiate(dentry, &inode->v);
-	return 0;
-err:
-	iput(&inode->v);
-	return bch2_err_class(ret);
-}
-
-static int bch2_mkdir(struct mnt_idmap *idmap,
-		      struct inode *vdir, struct dentry *dentry, umode_t mode)
-{
-	return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0);
-}
-
-static int bch2_rename2(struct mnt_idmap *idmap,
-			struct inode *src_vdir, struct dentry *src_dentry,
-			struct inode *dst_vdir, struct dentry *dst_dentry,
-			unsigned flags)
-{
-	struct bch_fs *c = src_vdir->i_sb->s_fs_info;
-	struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
-	struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
-	struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
-	struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
-	struct bch_inode_unpacked dst_dir_u, src_dir_u;
-	struct bch_inode_unpacked src_inode_u, dst_inode_u;
-	struct btree_trans *trans;
-	enum bch_rename_mode mode = flags & RENAME_EXCHANGE
-		? BCH_RENAME_EXCHANGE
-		: dst_dentry->d_inode
-		? BCH_RENAME_OVERWRITE : BCH_RENAME;
-	int ret;
-
-	if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
-		return -EINVAL;
-
-	if (mode == BCH_RENAME_OVERWRITE) {
-		ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
-						   0, LLONG_MAX);
-		if (ret)
-			return ret;
-	}
-
-	trans = bch2_trans_get(c);
-
-	bch2_lock_inodes(INODE_UPDATE_LOCK,
-			 src_dir,
-			 dst_dir,
-			 src_inode,
-			 dst_inode);
-
-	ret   = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?:
-		bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol);
-	if (ret)
-		goto err;
-
-	if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
-		ret = bch2_fs_quota_transfer(c, src_inode,
-					     dst_dir->ei_qid,
-					     1 << QTYP_PRJ,
-					     KEY_TYPE_QUOTA_PREALLOC);
-		if (ret)
-			goto err;
-	}
-
-	if (mode == BCH_RENAME_EXCHANGE &&
-	    inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
-		ret = bch2_fs_quota_transfer(c, dst_inode,
-					     src_dir->ei_qid,
-					     1 << QTYP_PRJ,
-					     KEY_TYPE_QUOTA_PREALLOC);
-		if (ret)
-			goto err;
-	}
-
-	ret = commit_do(trans, NULL, NULL, 0,
-			bch2_rename_trans(trans,
-					  inode_inum(src_dir), &src_dir_u,
-					  inode_inum(dst_dir), &dst_dir_u,
-					  &src_inode_u,
-					  &dst_inode_u,
-					  &src_dentry->d_name,
-					  &dst_dentry->d_name,
-					  mode));
-	if (unlikely(ret))
-		goto err;
-
-	BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
-	BUG_ON(dst_inode &&
-	       dst_inode->v.i_ino != dst_inode_u.bi_inum);
-
-	bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
-				      ATTR_MTIME|ATTR_CTIME);
-
-	if (src_dir != dst_dir)
-		bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
-					      ATTR_MTIME|ATTR_CTIME);
-
-	bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
-				      ATTR_CTIME);
-
-	if (dst_inode)
-		bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
-					      ATTR_CTIME);
-err:
-	bch2_trans_put(trans);
-
-	bch2_fs_quota_transfer(c, src_inode,
-			       bch_qid(&src_inode->ei_inode),
-			       1 << QTYP_PRJ,
-			       KEY_TYPE_QUOTA_NOCHECK);
-	if (dst_inode)
-		bch2_fs_quota_transfer(c, dst_inode,
-				       bch_qid(&dst_inode->ei_inode),
-				       1 << QTYP_PRJ,
-				       KEY_TYPE_QUOTA_NOCHECK);
-
-	bch2_unlock_inodes(INODE_UPDATE_LOCK,
-			   src_dir,
-			   dst_dir,
-			   src_inode,
-			   dst_inode);
-
-	return bch2_err_class(ret);
-}
-
-static void bch2_setattr_copy(struct mnt_idmap *idmap,
-			      struct bch_inode_info *inode,
-			      struct bch_inode_unpacked *bi,
-			      struct iattr *attr)
-{
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	unsigned int ia_valid = attr->ia_valid;
-
-	if (ia_valid & ATTR_UID)
-		bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
-	if (ia_valid & ATTR_GID)
-		bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
-
-	if (ia_valid & ATTR_SIZE)
-		bi->bi_size = attr->ia_size;
-
-	if (ia_valid & ATTR_ATIME)
-		bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
-	if (ia_valid & ATTR_MTIME)
-		bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
-	if (ia_valid & ATTR_CTIME)
-		bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);
-
-	if (ia_valid & ATTR_MODE) {
-		umode_t mode = attr->ia_mode;
-		kgid_t gid = ia_valid & ATTR_GID
-			? attr->ia_gid
-			: inode->v.i_gid;
-
-		if (!in_group_p(gid) &&
-		    !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID))
-			mode &= ~S_ISGID;
-		bi->bi_mode = mode;
-	}
-}
-
-int bch2_setattr_nonsize(struct mnt_idmap *idmap,
-			 struct bch_inode_info *inode,
-			 struct iattr *attr)
-{
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct bch_qid qid;
-	struct btree_trans *trans;
-	struct btree_iter inode_iter = { NULL };
-	struct bch_inode_unpacked inode_u;
-	struct posix_acl *acl = NULL;
-	int ret;
-
-	mutex_lock(&inode->ei_update_lock);
-
-	qid = inode->ei_qid;
-
-	if (attr->ia_valid & ATTR_UID)
-		qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
-
-	if (attr->ia_valid & ATTR_GID)
-		qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
-
-	ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
-				     KEY_TYPE_QUOTA_PREALLOC);
-	if (ret)
-		goto err;
-
-	trans = bch2_trans_get(c);
-retry:
-	bch2_trans_begin(trans);
-	kfree(acl);
-	acl = NULL;
-
-	ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
-			      BTREE_ITER_intent);
-	if (ret)
-		goto btree_err;
-
-	bch2_setattr_copy(idmap, inode, &inode_u, attr);
-
-	if (attr->ia_valid & ATTR_MODE) {
-		ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
-				     inode_u.bi_mode, &acl);
-		if (ret)
-			goto btree_err;
-	}
-
-	ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
-		bch2_trans_commit(trans, NULL, NULL,
-				  BCH_TRANS_COMMIT_no_enospc);
-btree_err:
-	bch2_trans_iter_exit(trans, &inode_iter);
-
-	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-		goto retry;
-	if (unlikely(ret))
-		goto err_trans;
-
-	bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);
-
-	if (acl)
-		set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
-err_trans:
-	bch2_trans_put(trans);
-err:
-	mutex_unlock(&inode->ei_update_lock);
-
-	return bch2_err_class(ret);
-}
-
-static int bch2_getattr(struct mnt_idmap *idmap,
-			const struct path *path, struct kstat *stat,
-			u32 request_mask, unsigned query_flags)
-{
-	struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-
-	stat->dev	= inode->v.i_sb->s_dev;
-	stat->ino	= inode->v.i_ino;
-	stat->mode	= inode->v.i_mode;
-	stat->nlink	= inode->v.i_nlink;
-	stat->uid	= inode->v.i_uid;
-	stat->gid	= inode->v.i_gid;
-	stat->rdev	= inode->v.i_rdev;
-	stat->size	= i_size_read(&inode->v);
-	stat->atime	= inode_get_atime(&inode->v);
-	stat->mtime	= inode_get_mtime(&inode->v);
-	stat->ctime	= inode_get_ctime(&inode->v);
-	stat->blksize	= block_bytes(c);
-	stat->blocks	= inode->v.i_blocks;
-
-	stat->subvol	= inode->ei_subvol;
-	stat->result_mask |= STATX_SUBVOL;
-
-	if (request_mask & STATX_BTIME) {
-		stat->result_mask |= STATX_BTIME;
-		stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
-	}
-
-	if (inode->ei_inode.bi_flags & BCH_INODE_immutable)
-		stat->attributes |= STATX_ATTR_IMMUTABLE;
-	stat->attributes_mask	 |= STATX_ATTR_IMMUTABLE;
-
-	if (inode->ei_inode.bi_flags & BCH_INODE_append)
-		stat->attributes |= STATX_ATTR_APPEND;
-	stat->attributes_mask	 |= STATX_ATTR_APPEND;
-
-	if (inode->ei_inode.bi_flags & BCH_INODE_nodump)
-		stat->attributes |= STATX_ATTR_NODUMP;
-	stat->attributes_mask	 |= STATX_ATTR_NODUMP;
-
-	return 0;
-}
-
-static int bch2_setattr(struct mnt_idmap *idmap,
-			struct dentry *dentry, struct iattr *iattr)
-{
-	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	int ret;
-
-	lockdep_assert_held(&inode->v.i_rwsem);
-
-	ret   = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
-		setattr_prepare(idmap, dentry, iattr);
-	if (ret)
-		return ret;
-
-	return iattr->ia_valid & ATTR_SIZE
-		? bchfs_truncate(idmap, inode, iattr)
-		: bch2_setattr_nonsize(idmap, inode, iattr);
-}
-
-static int bch2_tmpfile(struct mnt_idmap *idmap,
-			struct inode *vdir, struct file *file, umode_t mode)
-{
-	struct bch_inode_info *inode =
-		__bch2_create(idmap, to_bch_ei(vdir),
-			      file->f_path.dentry, mode, 0,
-			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
-
-	if (IS_ERR(inode))
-		return bch2_err_class(PTR_ERR(inode));
-
-	d_mark_tmpfile(file, &inode->v);
-	d_instantiate(file->f_path.dentry, &inode->v);
-	return finish_open_simple(file, 0);
-}
-
-static int bch2_fill_extent(struct bch_fs *c,
-			    struct fiemap_extent_info *info,
-			    struct bkey_s_c k, unsigned flags)
-{
-	if (bkey_extent_is_direct_data(k.k)) {
-		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-		const union bch_extent_entry *entry;
-		struct extent_ptr_decoded p;
-		int ret;
-
-		if (k.k->type == KEY_TYPE_reflink_v)
-			flags |= FIEMAP_EXTENT_SHARED;
-
-		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-			int flags2 = 0;
-			u64 offset = p.ptr.offset;
-
-			if (p.ptr.unwritten)
-				flags2 |= FIEMAP_EXTENT_UNWRITTEN;
-
-			if (p.crc.compression_type)
-				flags2 |= FIEMAP_EXTENT_ENCODED;
-			else
-				offset += p.crc.offset;
-
-			if ((offset & (block_sectors(c) - 1)) ||
-			    (k.k->size & (block_sectors(c) - 1)))
-				flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
-
-			ret = fiemap_fill_next_extent(info,
-						bkey_start_offset(k.k) << 9,
-						offset << 9,
-						k.k->size << 9, flags|flags2);
-			if (ret)
-				return ret;
-		}
-
-		return 0;
-	} else if (bkey_extent_is_inline_data(k.k)) {
-		return fiemap_fill_next_extent(info,
-					       bkey_start_offset(k.k) << 9,
-					       0, k.k->size << 9,
-					       flags|
-					       FIEMAP_EXTENT_DATA_INLINE);
-	} else if (k.k->type == KEY_TYPE_reservation) {
-		return fiemap_fill_next_extent(info,
-					       bkey_start_offset(k.k) << 9,
-					       0, k.k->size << 9,
-					       flags|
-					       FIEMAP_EXTENT_DELALLOC|
-					       FIEMAP_EXTENT_UNWRITTEN);
-	} else {
-		BUG();
-	}
-}
-
-static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
-		       u64 start, u64 len)
-{
-	struct bch_fs *c = vinode->i_sb->s_fs_info;
-	struct bch_inode_info *ei = to_bch_ei(vinode);
-	struct btree_trans *trans;
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	struct bkey_buf cur, prev;
-	unsigned offset_into_extent, sectors;
-	bool have_extent = false;
-	u32 snapshot;
-	int ret = 0;
-
-	ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
-	if (ret)
-		return ret;
-
-	struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
-	if (start + len < start)
-		return -EINVAL;
-
-	start >>= 9;
-
-	bch2_bkey_buf_init(&cur);
-	bch2_bkey_buf_init(&prev);
-	trans = bch2_trans_get(c);
-retry:
-	bch2_trans_begin(trans);
-
-	ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot);
-	if (ret)
-		goto err;
-
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
-			     SPOS(ei->v.i_ino, start, snapshot), 0);
-
-	while (!(ret = btree_trans_too_many_iters(trans)) &&
-	       (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
-	       !(ret = bkey_err(k))) {
-		enum btree_id data_btree = BTREE_ID_extents;
-
-		if (!bkey_extent_is_data(k.k) &&
-		    k.k->type != KEY_TYPE_reservation) {
-			bch2_btree_iter_advance(&iter);
-			continue;
-		}
-
-		offset_into_extent	= iter.pos.offset -
-			bkey_start_offset(k.k);
-		sectors			= k.k->size - offset_into_extent;
-
-		bch2_bkey_buf_reassemble(&cur, c, k);
-
-		ret = bch2_read_indirect_extent(trans, &data_btree,
-					&offset_into_extent, &cur);
-		if (ret)
-			break;
-
-		k = bkey_i_to_s_c(cur.k);
-		bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
-
-		sectors = min(sectors, k.k->size - offset_into_extent);
-
-		bch2_cut_front(POS(k.k->p.inode,
-				   bkey_start_offset(k.k) +
-				   offset_into_extent),
-			       cur.k);
-		bch2_key_resize(&cur.k->k, sectors);
-		cur.k->k.p = iter.pos;
-		cur.k->k.p.offset += cur.k->k.size;
-
-		if (have_extent) {
-			bch2_trans_unlock(trans);
-			ret = bch2_fill_extent(c, info,
-					bkey_i_to_s_c(prev.k), 0);
-			if (ret)
-				break;
-		}
-
-		bkey_copy(prev.k, cur.k);
-		have_extent = true;
-
-		bch2_btree_iter_set_pos(&iter,
-			POS(iter.pos.inode, iter.pos.offset + sectors));
-
-		ret = bch2_trans_relock(trans);
-		if (ret)
-			break;
-	}
-	start = iter.pos.offset;
-	bch2_trans_iter_exit(trans, &iter);
-err:
-	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-		goto retry;
-
-	if (!ret && have_extent) {
-		bch2_trans_unlock(trans);
-		ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
-				       FIEMAP_EXTENT_LAST);
-	}
-
-	bch2_trans_put(trans);
-	bch2_bkey_buf_exit(&cur, c);
-	bch2_bkey_buf_exit(&prev, c);
-	return ret < 0 ? ret : 0;
-}
-
-static const struct vm_operations_struct bch_vm_ops = {
-	.fault		= bch2_page_fault,
-	.map_pages	= filemap_map_pages,
-	.page_mkwrite   = bch2_page_mkwrite,
-};
-
-static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	file_accessed(file);
-
-	vma->vm_ops = &bch_vm_ops;
-	return 0;
-}
-
-/* Directories: */
-
-static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
-{
-	return generic_file_llseek_size(file, offset, whence,
-					S64_MAX, S64_MAX);
-}
-
-static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
-{
-	struct bch_inode_info *inode = file_bch_inode(file);
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-
-	if (!dir_emit_dots(file, ctx))
-		return 0;
-
-	int ret = bch2_readdir(c, inode_inum(inode), ctx);
-
-	bch_err_fn(c, ret);
-	return bch2_err_class(ret);
-}
-
-static int bch2_open(struct inode *vinode, struct file *file)
-{
-	if (file->f_flags & (O_WRONLY|O_RDWR)) {
-		struct bch_inode_info *inode = to_bch_ei(vinode);
-		struct bch_fs *c = inode->v.i_sb->s_fs_info;
-
-		int ret = bch2_subvol_is_ro(c, inode->ei_subvol);
-		if (ret)
-			return ret;
-	}
-
-	return generic_file_open(vinode, file);
-}
-
-static const struct file_operations bch_file_operations = {
-	.open		= bch2_open,
-	.llseek		= bch2_llseek,
-	.read_iter	= bch2_read_iter,
-	.write_iter	= bch2_write_iter,
-	.mmap		= bch2_mmap,
-	.fsync		= bch2_fsync,
-	.splice_read	= filemap_splice_read,
-	.splice_write	= iter_file_splice_write,
-	.fallocate	= bch2_fallocate_dispatch,
-	.unlocked_ioctl = bch2_fs_file_ioctl,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl	= bch2_compat_fs_ioctl,
-#endif
-	.remap_file_range = bch2_remap_file_range,
-};
-
-static const struct inode_operations bch_file_inode_operations = {
-	.getattr	= bch2_getattr,
-	.setattr	= bch2_setattr,
-	.fiemap		= bch2_fiemap,
-	.listxattr	= bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-	.get_acl	= bch2_get_acl,
-	.set_acl	= bch2_set_acl,
-#endif
-};
-
-static const struct inode_operations bch_dir_inode_operations = {
-	.lookup		= bch2_lookup,
-	.create		= bch2_create,
-	.link		= bch2_link,
-	.unlink		= bch2_unlink,
-	.symlink	= bch2_symlink,
-	.mkdir		= bch2_mkdir,
-	.rmdir		= bch2_unlink,
-	.mknod		= bch2_mknod,
-	.rename		= bch2_rename2,
-	.getattr	= bch2_getattr,
-	.setattr	= bch2_setattr,
-	.tmpfile	= bch2_tmpfile,
-	.listxattr	= bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-	.get_acl	= bch2_get_acl,
-	.set_acl	= bch2_set_acl,
-#endif
-};
-
-static const struct file_operations bch_dir_file_operations = {
-	.llseek		= bch2_dir_llseek,
-	.read		= generic_read_dir,
-	.iterate_shared	= bch2_vfs_readdir,
-	.fsync		= bch2_fsync,
-	.unlocked_ioctl = bch2_fs_file_ioctl,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl	= bch2_compat_fs_ioctl,
-#endif
-};
-
-static const struct inode_operations bch_symlink_inode_operations = {
-	.get_link	= page_get_link,
-	.getattr	= bch2_getattr,
-	.setattr	= bch2_setattr,
-	.listxattr	= bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-	.get_acl	= bch2_get_acl,
-	.set_acl	= bch2_set_acl,
-#endif
-};
-
-static const struct inode_operations bch_special_inode_operations = {
-	.getattr	= bch2_getattr,
-	.setattr	= bch2_setattr,
-	.listxattr	= bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-	.get_acl	= bch2_get_acl,
-	.set_acl	= bch2_set_acl,
-#endif
-};
-
-static const struct address_space_operations bch_address_space_operations = {
-	.read_folio	= bch2_read_folio,
-	.writepages	= bch2_writepages,
-	.readahead	= bch2_readahead,
-	.dirty_folio	= filemap_dirty_folio,
-	.write_begin	= bch2_write_begin,
-	.write_end	= bch2_write_end,
-	.invalidate_folio = bch2_invalidate_folio,
-	.release_folio	= bch2_release_folio,
-	.direct_IO	= noop_direct_IO,
-#ifdef CONFIG_MIGRATION
-	.migrate_folio	= filemap_migrate_folio,
-#endif
-	.error_remove_folio = generic_error_remove_folio,
-};
-
-struct bcachefs_fid {
-	u64		inum;
-	u32		subvol;
-	u32		gen;
-} __packed;
-
-struct bcachefs_fid_with_parent {
-	struct bcachefs_fid	fid;
-	struct bcachefs_fid	dir;
-} __packed;
-
-static int bcachefs_fid_valid(int fh_len, int fh_type)
-{
-	switch (fh_type) {
-	case FILEID_BCACHEFS_WITHOUT_PARENT:
-		return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
-	case FILEID_BCACHEFS_WITH_PARENT:
-		return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
-	default:
-		return false;
-	}
-}
-
-static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
-{
-	return (struct bcachefs_fid) {
-		.inum	= inode->ei_inode.bi_inum,
-		.subvol	= inode->ei_subvol,
-		.gen	= inode->ei_inode.bi_generation,
-	};
-}
-
-static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
-			  struct inode *vdir)
-{
-	struct bch_inode_info *inode	= to_bch_ei(vinode);
-	struct bch_inode_info *dir	= to_bch_ei(vdir);
-	int min_len;
-
-	if (!S_ISDIR(inode->v.i_mode) && dir) {
-		struct bcachefs_fid_with_parent *fid = (void *) fh;
-
-		min_len = sizeof(*fid) / sizeof(u32);
-		if (*len < min_len) {
-			*len = min_len;
-			return FILEID_INVALID;
-		}
-
-		fid->fid = bch2_inode_to_fid(inode);
-		fid->dir = bch2_inode_to_fid(dir);
-
-		*len = min_len;
-		return FILEID_BCACHEFS_WITH_PARENT;
-	} else {
-		struct bcachefs_fid *fid = (void *) fh;
-
-		min_len = sizeof(*fid) / sizeof(u32);
-		if (*len < min_len) {
-			*len = min_len;
-			return FILEID_INVALID;
-		}
-		*fid = bch2_inode_to_fid(inode);
-
-		*len = min_len;
-		return FILEID_BCACHEFS_WITHOUT_PARENT;
-	}
-}
-
-static struct inode *bch2_nfs_get_inode(struct super_block *sb,
-					struct bcachefs_fid fid)
-{
-	struct bch_fs *c = sb->s_fs_info;
-	struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
-				    .subvol = fid.subvol,
-				    .inum = fid.inum,
-	});
-	if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
-		iput(vinode);
-		vinode = ERR_PTR(-ESTALE);
-	}
-	return vinode;
-}
-
-static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
-		int fh_len, int fh_type)
-{
-	struct bcachefs_fid *fid = (void *) _fid;
-
-	if (!bcachefs_fid_valid(fh_len, fh_type))
-		return NULL;
-
-	return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
-}
-
-static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
-		int fh_len, int fh_type)
-{
-	struct bcachefs_fid_with_parent *fid = (void *) _fid;
-
-	if (!bcachefs_fid_valid(fh_len, fh_type) ||
-	    fh_type != FILEID_BCACHEFS_WITH_PARENT)
-		return NULL;
-
-	return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
-}
-
-static struct dentry *bch2_get_parent(struct dentry *child)
-{
-	struct bch_inode_info *inode = to_bch_ei(child->d_inode);
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	subvol_inum parent_inum = {
-		.subvol = inode->ei_inode.bi_parent_subvol ?:
-			inode->ei_subvol,
-		.inum = inode->ei_inode.bi_dir,
-	};
-
-	return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
-}
-
-static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
-{
-	struct bch_inode_info *inode	= to_bch_ei(child->d_inode);
-	struct bch_inode_info *dir	= to_bch_ei(parent->d_inode);
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct btree_trans *trans;
-	struct btree_iter iter1;
-	struct btree_iter iter2;
-	struct bkey_s_c k;
-	struct bkey_s_c_dirent d;
-	struct bch_inode_unpacked inode_u;
-	subvol_inum target;
-	u32 snapshot;
-	struct qstr dirent_name;
-	unsigned name_len = 0;
-	int ret;
-
-	if (!S_ISDIR(dir->v.i_mode))
-		return -EINVAL;
-
-	trans = bch2_trans_get(c);
-
-	bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
-			     POS(dir->ei_inode.bi_inum, 0), 0);
-	bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
-			     POS(dir->ei_inode.bi_inum, 0), 0);
-retry:
-	bch2_trans_begin(trans);
-
-	ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot);
-	if (ret)
-		goto err;
-
-	bch2_btree_iter_set_snapshot(&iter1, snapshot);
-	bch2_btree_iter_set_snapshot(&iter2, snapshot);
-
-	ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
-	if (ret)
-		goto err;
-
-	if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
-		bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
-
-		k = bch2_btree_iter_peek_slot(&iter1);
-		ret = bkey_err(k);
-		if (ret)
-			goto err;
-
-		if (k.k->type != KEY_TYPE_dirent) {
-			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
-			goto err;
-		}
-
-		d = bkey_s_c_to_dirent(k);
-		ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
-		if (ret > 0)
-			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
-		if (ret)
-			goto err;
-
-		if (target.subvol	== inode->ei_subvol &&
-		    target.inum		== inode->ei_inode.bi_inum)
-			goto found;
-	} else {
-		/*
-		 * File with multiple hardlinks and our backref is to the wrong
-		 * directory - linear search:
-		 */
-		for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
-			if (k.k->p.inode > dir->ei_inode.bi_inum)
-				break;
-
-			if (k.k->type != KEY_TYPE_dirent)
-				continue;
-
-			d = bkey_s_c_to_dirent(k);
-			ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
-			if (ret < 0)
-				break;
-			if (ret)
-				continue;
-
-			if (target.subvol	== inode->ei_subvol &&
-			    target.inum		== inode->ei_inode.bi_inum)
-				goto found;
-		}
-	}
-
-	ret = -ENOENT;
-	goto err;
-found:
-	dirent_name = bch2_dirent_get_name(d);
-
-	name_len = min_t(unsigned, dirent_name.len, NAME_MAX);
-	memcpy(name, dirent_name.name, name_len);
-	name[name_len] = '\0';
-err:
-	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-		goto retry;
-
-	bch2_trans_iter_exit(trans, &iter1);
-	bch2_trans_iter_exit(trans, &iter2);
-	bch2_trans_put(trans);
-
-	return ret;
-}
-
-static const struct export_operations bch_export_ops = {
-	.encode_fh	= bch2_encode_fh,
-	.fh_to_dentry	= bch2_fh_to_dentry,
-	.fh_to_parent	= bch2_fh_to_parent,
-	.get_parent	= bch2_get_parent,
-	.get_name	= bch2_get_name,
-};
-
-static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
-				struct bch_inode_info *inode,
-				struct bch_inode_unpacked *bi,
-				struct bch_subvolume *subvol)
-{
-	bch2_iget5_set(&inode->v, &inum);
-	bch2_inode_update_after_write(trans, inode, bi, ~0);
-
-	if (BCH_SUBVOLUME_SNAP(subvol))
-		set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
-	else
-		clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
-
-	inode->v.i_blocks	= bi->bi_sectors;
-	inode->v.i_ino		= bi->bi_inum;
-	inode->v.i_rdev		= bi->bi_dev;
-	inode->v.i_generation	= bi->bi_generation;
-	inode->v.i_size		= bi->bi_size;
-
-	inode->ei_flags		= 0;
-	inode->ei_quota_reserved = 0;
-	inode->ei_qid		= bch_qid(bi);
-	inode->ei_subvol	= inum.subvol;
-
-	inode->v.i_mapping->a_ops = &bch_address_space_operations;
-
-	switch (inode->v.i_mode & S_IFMT) {
-	case S_IFREG:
-		inode->v.i_op	= &bch_file_inode_operations;
-		inode->v.i_fop	= &bch_file_operations;
-		break;
-	case S_IFDIR:
-		inode->v.i_op	= &bch_dir_inode_operations;
-		inode->v.i_fop	= &bch_dir_file_operations;
-		break;
-	case S_IFLNK:
-		inode_nohighmem(&inode->v);
-		inode->v.i_op	= &bch_symlink_inode_operations;
-		break;
-	default:
-		init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
-		inode->v.i_op	= &bch_special_inode_operations;
-		break;
-	}
-
-	mapping_set_large_folios(inode->v.i_mapping);
-}
-
-static void bch2_free_inode(struct inode *vinode)
-{
-	kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode));
-}
-
-static int inode_update_times_fn(struct btree_trans *trans,
-				 struct bch_inode_info *inode,
-				 struct bch_inode_unpacked *bi,
-				 void *p)
-{
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-
-	bi->bi_atime	= timespec_to_bch2_time(c, inode_get_atime(&inode->v));
-	bi->bi_mtime	= timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
-	bi->bi_ctime	= timespec_to_bch2_time(c, inode_get_ctime(&inode->v));
-
-	return 0;
-}
-
-static int bch2_vfs_write_inode(struct inode *vinode,
-				struct writeback_control *wbc)
-{
-	struct bch_fs *c = vinode->i_sb->s_fs_info;
-	struct bch_inode_info *inode = to_bch_ei(vinode);
-	int ret;
-
-	mutex_lock(&inode->ei_update_lock);
-	ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
-			       ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
-	mutex_unlock(&inode->ei_update_lock);
-
-	return bch2_err_class(ret);
-}
-
-static void bch2_evict_inode(struct inode *vinode)
-{
-	struct bch_fs *c = vinode->i_sb->s_fs_info;
-	struct bch_inode_info *inode = to_bch_ei(vinode);
-
-	truncate_inode_pages_final(&inode->v.i_data);
-
-	clear_inode(&inode->v);
-
-	BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
-
-	if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
-		bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
-				KEY_TYPE_QUOTA_WARN);
-		bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
-				KEY_TYPE_QUOTA_WARN);
-		bch2_inode_rm(c, inode_inum(inode));
-	}
-
-	mutex_lock(&c->vfs_inodes_lock);
-	list_del_init(&inode->ei_vfs_inode_list);
-	mutex_unlock(&c->vfs_inodes_lock);
-}
-
-void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
-{
-	struct bch_inode_info *inode;
-	DARRAY(struct bch_inode_info *) grabbed;
-	bool clean_pass = false, this_pass_clean;
-
-	/*
-	 * Initially, we scan for inodes without I_DONTCACHE, then mark them to
-	 * be pruned with d_mark_dontcache().
-	 *
-	 * Once we've had a clean pass where we didn't find any inodes without
-	 * I_DONTCACHE, we wait for them to be freed:
-	 */
-
-	darray_init(&grabbed);
-	darray_make_room(&grabbed, 1024);
-again:
-	cond_resched();
-	this_pass_clean = true;
-
-	mutex_lock(&c->vfs_inodes_lock);
-	list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
-		if (!snapshot_list_has_id(s, inode->ei_subvol))
-			continue;
-
-		if (!(inode->v.i_state & I_DONTCACHE) &&
-		    !(inode->v.i_state & I_FREEING) &&
-		    igrab(&inode->v)) {
-			this_pass_clean = false;
-
-			if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) {
-				iput(&inode->v);
-				break;
-			}
-		} else if (clean_pass && this_pass_clean) {
-			wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW);
-			DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
-
-			prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
-			mutex_unlock(&c->vfs_inodes_lock);
-
-			schedule();
-			finish_wait(wq, &wait.wq_entry);
-			goto again;
-		}
-	}
-	mutex_unlock(&c->vfs_inodes_lock);
-
-	darray_for_each(grabbed, i) {
-		inode = *i;
-		d_mark_dontcache(&inode->v);
-		d_prune_aliases(&inode->v);
-		iput(&inode->v);
-	}
-	grabbed.nr = 0;
-
-	if (!clean_pass || !this_pass_clean) {
-		clean_pass = this_pass_clean;
-		goto again;
-	}
-
-	darray_exit(&grabbed);
-}
-
-static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
-	struct super_block *sb = dentry->d_sb;
-	struct bch_fs *c = sb->s_fs_info;
-	struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
-	unsigned shift = sb->s_blocksize_bits - 9;
-	/*
-	 * this assumes inodes take up 64 bytes, which is a decent average
-	 * number:
-	 */
-	u64 avail_inodes = ((usage.capacity - usage.used) << 3);
-
-	buf->f_type	= BCACHEFS_STATFS_MAGIC;
-	buf->f_bsize	= sb->s_blocksize;
-	buf->f_blocks	= usage.capacity >> shift;
-	buf->f_bfree	= usage.free >> shift;
-	buf->f_bavail	= avail_factor(usage.free) >> shift;
-
-	buf->f_files	= usage.nr_inodes + avail_inodes;
-	buf->f_ffree	= avail_inodes;
-
-	buf->f_fsid	= uuid_to_fsid(c->sb.user_uuid.b);
-	buf->f_namelen	= BCH_NAME_MAX;
-
-	return 0;
-}
-
-static int bch2_sync_fs(struct super_block *sb, int wait)
-{
-	struct bch_fs *c = sb->s_fs_info;
-	int ret;
-
-	if (c->opts.journal_flush_disabled)
-		return 0;
-
-	if (!wait) {
-		bch2_journal_flush_async(&c->journal, NULL);
-		return 0;
-	}
-
-	ret = bch2_journal_flush(&c->journal);
-	return bch2_err_class(ret);
-}
-
-static struct bch_fs *bch2_path_to_fs(const char *path)
-{
-	struct bch_fs *c;
-	dev_t dev;
-	int ret;
-
-	ret = lookup_bdev(path, &dev);
-	if (ret)
-		return ERR_PTR(ret);
-
-	c = bch2_dev_to_fs(dev);
-	if (c)
-		closure_put(&c->cl);
-	return c ?: ERR_PTR(-ENOENT);
-}
-
-static int bch2_remount(struct super_block *sb, int *flags, char *data)
-{
-	struct bch_fs *c = sb->s_fs_info;
-	struct bch_opts opts = bch2_opts_empty();
-	int ret;
-
-	ret = bch2_parse_mount_opts(c, &opts, data);
-	if (ret)
-		goto err;
-
-	opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
-
-	if (opts.read_only != c->opts.read_only) {
-		down_write(&c->state_lock);
-
-		if (opts.read_only) {
-			bch2_fs_read_only(c);
-
-			sb->s_flags |= SB_RDONLY;
-		} else {
-			ret = bch2_fs_read_write(c);
-			if (ret) {
-				bch_err(c, "error going rw: %i", ret);
-				up_write(&c->state_lock);
-				ret = -EINVAL;
-				goto err;
-			}
-
-			sb->s_flags &= ~SB_RDONLY;
-		}
-
-		c->opts.read_only = opts.read_only;
-
-		up_write(&c->state_lock);
-	}
-
-	if (opt_defined(opts, errors))
-		c->opts.errors = opts.errors;
-err:
-	return bch2_err_class(ret);
-}
-
-static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
-{
-	struct bch_fs *c = root->d_sb->s_fs_info;
-	bool first = true;
-
-	for_each_online_member(c, ca) {
-		if (!first)
-			seq_putc(seq, ':');
-		first = false;
-		seq_puts(seq, ca->disk_sb.sb_name);
-	}
-
-	return 0;
-}
-
-static int bch2_show_options(struct seq_file *seq, struct dentry *root)
-{
-	struct bch_fs *c = root->d_sb->s_fs_info;
-	enum bch_opt_id i;
-	struct printbuf buf = PRINTBUF;
-	int ret = 0;
-
-	for (i = 0; i < bch2_opts_nr; i++) {
-		const struct bch_option *opt = &bch2_opt_table[i];
-		u64 v = bch2_opt_get_by_id(&c->opts, i);
-
-		if (!(opt->flags & OPT_MOUNT))
-			continue;
-
-		if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
-			continue;
-
-		printbuf_reset(&buf);
-		bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v,
-				 OPT_SHOW_MOUNT_STYLE);
-		seq_putc(seq, ',');
-		seq_puts(seq, buf.buf);
-	}
-
-	if (buf.allocation_failure)
-		ret = -ENOMEM;
-	printbuf_exit(&buf);
-	return ret;
-}
-
-static void bch2_put_super(struct super_block *sb)
-{
-	struct bch_fs *c = sb->s_fs_info;
-
-	__bch2_fs_stop(c);
-}
-
-/*
- * bcachefs doesn't currently integrate intwrite freeze protection but the
- * internal write references serve the same purpose. Therefore reuse the
- * read-only transition code to perform the quiesce. The caveat is that we don't
- * currently have the ability to block tasks that want a write reference while
- * the superblock is frozen. This is fine for now, but we should either add
- * blocking support or find a way to integrate sb_start_intwrite() and friends.
- */
-static int bch2_freeze(struct super_block *sb)
-{
-	struct bch_fs *c = sb->s_fs_info;
-
-	down_write(&c->state_lock);
-	bch2_fs_read_only(c);
-	up_write(&c->state_lock);
-	return 0;
-}
-
-static int bch2_unfreeze(struct super_block *sb)
-{
-	struct bch_fs *c = sb->s_fs_info;
-	int ret;
-
-	if (test_bit(BCH_FS_emergency_ro, &c->flags))
-		return 0;
-
-	down_write(&c->state_lock);
-	ret = bch2_fs_read_write(c);
-	up_write(&c->state_lock);
-	return ret;
-}
-
-static const struct super_operations bch_super_operations = {
-	.alloc_inode	= bch2_alloc_inode,
-	.free_inode	= bch2_free_inode,
-	.write_inode	= bch2_vfs_write_inode,
-	.evict_inode	= bch2_evict_inode,
-	.sync_fs	= bch2_sync_fs,
-	.statfs		= bch2_statfs,
-	.show_devname	= bch2_show_devname,
-	.show_options	= bch2_show_options,
-	.remount_fs	= bch2_remount,
-	.put_super	= bch2_put_super,
-	.freeze_fs	= bch2_freeze,
-	.unfreeze_fs	= bch2_unfreeze,
-};
-
-static int bch2_set_super(struct super_block *s, void *data)
-{
-	s->s_fs_info = data;
-	return 0;
-}
-
-static int bch2_noset_super(struct super_block *s, void *data)
-{
-	return -EBUSY;
-}
-
-typedef DARRAY(struct bch_fs *) darray_fs;
-
-static int bch2_test_super(struct super_block *s, void *data)
-{
-	struct bch_fs *c = s->s_fs_info;
-	darray_fs *d = data;
-
-	if (!c)
-		return false;
-
-	darray_for_each(*d, i)
-		if (c != *i)
-			return false;
-	return true;
-}
-
-static struct dentry *bch2_mount(struct file_system_type *fs_type,
-				 int flags, const char *dev_name, void *data)
-{
-	struct bch_fs *c;
-	struct super_block *sb;
-	struct inode *vinode;
-	struct bch_opts opts = bch2_opts_empty();
-	int ret;
-
-	opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
-
-	ret = bch2_parse_mount_opts(NULL, &opts, data);
-	if (ret) {
-		ret = bch2_err_class(ret);
-		return ERR_PTR(ret);
-	}
-
-	if (!dev_name || strlen(dev_name) == 0)
-		return ERR_PTR(-EINVAL);
-
-	darray_str devs;
-	ret = bch2_split_devs(dev_name, &devs);
-	if (ret)
-		return ERR_PTR(ret);
-
-	darray_fs devs_to_fs = {};
-	darray_for_each(devs, i) {
-		ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i));
-		if (ret) {
-			sb = ERR_PTR(ret);
-			goto got_sb;
-		}
-	}
-
-	sb = sget(fs_type, bch2_test_super, bch2_noset_super, flags|SB_NOSEC, &devs_to_fs);
-	if (!IS_ERR(sb))
-		goto got_sb;
-
-	c = bch2_fs_open(devs.data, devs.nr, opts);
-	if (IS_ERR(c)) {
-		sb = ERR_CAST(c);
-		goto got_sb;
-	}
-
-	/* Some options can't be parsed until after the fs is started: */
-	ret = bch2_parse_mount_opts(c, &opts, data);
-	if (ret) {
-		bch2_fs_stop(c);
-		sb = ERR_PTR(ret);
-		goto got_sb;
-	}
-
-	bch2_opts_apply(&c->opts, opts);
-
-	sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c);
-	if (IS_ERR(sb))
-		bch2_fs_stop(c);
-got_sb:
-	darray_exit(&devs_to_fs);
-	bch2_darray_str_exit(&devs);
-
-	if (IS_ERR(sb)) {
-		ret = PTR_ERR(sb);
-		ret = bch2_err_class(ret);
-		return ERR_PTR(ret);
-	}
-
-	c = sb->s_fs_info;
-
-	if (sb->s_root) {
-		if ((flags ^ sb->s_flags) & SB_RDONLY) {
-			ret = -EBUSY;
-			goto err_put_super;
-		}
-		goto out;
-	}
-
-	sb->s_blocksize		= block_bytes(c);
-	sb->s_blocksize_bits	= ilog2(block_bytes(c));
-	sb->s_maxbytes		= MAX_LFS_FILESIZE;
-	sb->s_op		= &bch_super_operations;
-	sb->s_export_op		= &bch_export_ops;
-#ifdef CONFIG_BCACHEFS_QUOTA
-	sb->s_qcop		= &bch2_quotactl_operations;
-	sb->s_quota_types	= QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
-#endif
-	sb->s_xattr		= bch2_xattr_handlers;
-	sb->s_magic		= BCACHEFS_STATFS_MAGIC;
-	sb->s_time_gran		= c->sb.nsec_per_time_unit;
-	sb->s_time_min		= div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
-	sb->s_time_max		= div_s64(S64_MAX, c->sb.time_units_per_sec);
-	sb->s_uuid		= c->sb.user_uuid;
-	c->vfs_sb		= sb;
-	strscpy(sb->s_id, c->name, sizeof(sb->s_id));
-
-	ret = super_setup_bdi(sb);
-	if (ret)
-		goto err_put_super;
-
-	sb->s_bdi->ra_pages		= VM_READAHEAD_PAGES;
-
-	for_each_online_member(c, ca) {
-		struct block_device *bdev = ca->disk_sb.bdev;
-
-		/* XXX: create an anonymous device for multi device filesystems */
-		sb->s_bdev	= bdev;
-		sb->s_dev	= bdev->bd_dev;
-		percpu_ref_put(&ca->io_ref);
-		break;
-	}
-
-	c->dev = sb->s_dev;
-
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-	if (c->opts.acl)
-		sb->s_flags	|= SB_POSIXACL;
-#endif
-
-	sb->s_shrink->seeks = 0;
-
-	vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
-	ret = PTR_ERR_OR_ZERO(vinode);
-	bch_err_msg(c, ret, "mounting: error getting root inode");
-	if (ret)
-		goto err_put_super;
-
-	sb->s_root = d_make_root(vinode);
-	if (!sb->s_root) {
-		bch_err(c, "error mounting: error allocating root dentry");
-		ret = -ENOMEM;
-		goto err_put_super;
-	}
-
-	sb->s_flags |= SB_ACTIVE;
-out:
-	return dget(sb->s_root);
-
-err_put_super:
-	__bch2_fs_stop(c);
-	deactivate_locked_super(sb);
-	return ERR_PTR(bch2_err_class(ret));
-}
-
-static void bch2_kill_sb(struct super_block *sb)
-{
-	struct bch_fs *c = sb->s_fs_info;
-
-	generic_shutdown_super(sb);
-	bch2_fs_free(c);
-}
-
-static struct file_system_type bcache_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "bcachefs",
-	.mount		= bch2_mount,
-	.kill_sb	= bch2_kill_sb,
-	.fs_flags	= FS_REQUIRES_DEV,
-};
-
-MODULE_ALIAS_FS("bcachefs");
-
-void bch2_vfs_exit(void)
-{
-	unregister_filesystem(&bcache_fs_type);
-	kmem_cache_destroy(bch2_inode_cache);
-}
-
-int __init bch2_vfs_init(void)
-{
-	int ret = -ENOMEM;
-
-	bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT);
-	if (!bch2_inode_cache)
-		goto err;
-
-	ret = register_filesystem(&bcache_fs_type);
-	if (ret)
-		goto err;
-
-	return 0;
-err:
-	bch2_vfs_exit();
-	return ret;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
deleted file mode 100644
index c3af7225ff69..000000000000
--- a/fs/bcachefs/fs.h
+++ /dev/null
@@ -1,204 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_H
-#define _BCACHEFS_FS_H
-
-#include "inode.h"
-#include "opts.h"
-#include "str_hash.h"
-#include "quota_types.h"
-#include "two_state_shared_lock.h"
-
-#include <linux/seqlock.h>
-#include <linux/stat.h>
-
-struct bch_inode_info {
-	struct inode		v;
-	struct list_head	ei_vfs_inode_list;
-	unsigned long		ei_flags;
-
-	struct mutex		ei_update_lock;
-	u64			ei_quota_reserved;
-	unsigned long		ei_last_dirtied;
-	two_state_lock_t	ei_pagecache_lock;
-
-	struct mutex		ei_quota_lock;
-	struct bch_qid		ei_qid;
-
-	u32			ei_subvol;
-
-	/*
-	 * When we've been doing nocow writes we'll need to issue flushes to the
-	 * underlying block devices
-	 *
-	 * XXX: a device may have had a flush issued by some other codepath. It
-	 * would be better to keep for each device a sequence number that's
-	 * incremented when we isusue a cache flush, and track here the sequence
-	 * number that needs flushing.
-	 */
-	struct bch_devs_mask	ei_devs_need_flush;
-
-	/* copy of inode in btree: */
-	struct bch_inode_unpacked ei_inode;
-};
-
-#define bch2_pagecache_add_put(i)	bch2_two_state_unlock(&i->ei_pagecache_lock, 0)
-#define bch2_pagecache_add_tryget(i)	bch2_two_state_trylock(&i->ei_pagecache_lock, 0)
-#define bch2_pagecache_add_get(i)	bch2_two_state_lock(&i->ei_pagecache_lock, 0)
-
-#define bch2_pagecache_block_put(i)	bch2_two_state_unlock(&i->ei_pagecache_lock, 1)
-#define bch2_pagecache_block_get(i)	bch2_two_state_lock(&i->ei_pagecache_lock, 1)
-
-static inline subvol_inum inode_inum(struct bch_inode_info *inode)
-{
-	return (subvol_inum) {
-		.subvol	= inode->ei_subvol,
-		.inum	= inode->ei_inode.bi_inum,
-	};
-}
-
-/*
- * Set if we've gotten a btree error for this inode, and thus the vfs inode and
- * btree inode may be inconsistent:
- */
-#define EI_INODE_ERROR			0
-
-/*
- * Set in the inode is in a snapshot subvolume - we don't do quota accounting in
- * those:
- */
-#define EI_INODE_SNAPSHOT		1
-
-#define to_bch_ei(_inode)					\
-	container_of_or_null(_inode, struct bch_inode_info, v)
-
-static inline int ptrcmp(void *l, void *r)
-{
-	return cmp_int(l, r);
-}
-
-enum bch_inode_lock_op {
-	INODE_PAGECACHE_BLOCK	= (1U << 0),
-	INODE_UPDATE_LOCK	= (1U << 1),
-};
-
-#define bch2_lock_inodes(_locks, ...)					\
-do {									\
-	struct bch_inode_info *a[] = { NULL, __VA_ARGS__ };		\
-	unsigned i;							\
-									\
-	bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp);			\
-									\
-	for (i = 1; i < ARRAY_SIZE(a); i++)				\
-		if (a[i] != a[i - 1]) {					\
-			if ((_locks) & INODE_PAGECACHE_BLOCK)		\
-				bch2_pagecache_block_get(a[i]);\
-			if ((_locks) & INODE_UPDATE_LOCK)			\
-				mutex_lock_nested(&a[i]->ei_update_lock, i);\
-		}							\
-} while (0)
-
-#define bch2_unlock_inodes(_locks, ...)					\
-do {									\
-	struct bch_inode_info *a[] = { NULL, __VA_ARGS__ };		\
-	unsigned i;							\
-									\
-	bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp);			\
-									\
-	for (i = 1; i < ARRAY_SIZE(a); i++)				\
-		if (a[i] != a[i - 1]) {					\
-			if ((_locks) & INODE_PAGECACHE_BLOCK)		\
-				bch2_pagecache_block_put(a[i]);\
-			if ((_locks) & INODE_UPDATE_LOCK)			\
-				mutex_unlock(&a[i]->ei_update_lock);	\
-		}							\
-} while (0)
-
-static inline struct bch_inode_info *file_bch_inode(struct file *file)
-{
-	return to_bch_ei(file_inode(file));
-}
-
-static inline bool inode_attr_changing(struct bch_inode_info *dir,
-				struct bch_inode_info *inode,
-				enum inode_opt_id id)
-{
-	return !(inode->ei_inode.bi_fields_set & (1 << id)) &&
-		bch2_inode_opt_get(&dir->ei_inode, id) !=
-		bch2_inode_opt_get(&inode->ei_inode, id);
-}
-
-static inline bool inode_attrs_changing(struct bch_inode_info *dir,
-				 struct bch_inode_info *inode)
-{
-	unsigned id;
-
-	for (id = 0; id < Inode_opt_nr; id++)
-		if (inode_attr_changing(dir, inode, id))
-			return true;
-
-	return false;
-}
-
-struct bch_inode_unpacked;
-
-#ifndef NO_BCACHEFS_FS
-
-struct bch_inode_info *
-__bch2_create(struct mnt_idmap *, struct bch_inode_info *,
-	      struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
-
-int bch2_fs_quota_transfer(struct bch_fs *,
-			   struct bch_inode_info *,
-			   struct bch_qid,
-			   unsigned,
-			   enum quota_acct_mode);
-
-static inline int bch2_set_projid(struct bch_fs *c,
-				  struct bch_inode_info *inode,
-				  u32 projid)
-{
-	struct bch_qid qid = inode->ei_qid;
-
-	qid.q[QTYP_PRJ] = projid;
-
-	return bch2_fs_quota_transfer(c, inode, qid,
-				      1 << QTYP_PRJ,
-				      KEY_TYPE_QUOTA_PREALLOC);
-}
-
-struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum);
-
-/* returns 0 if we want to do the update, or error is passed up */
-typedef int (*inode_set_fn)(struct btree_trans *,
-			    struct bch_inode_info *,
-			    struct bch_inode_unpacked *, void *);
-
-void bch2_inode_update_after_write(struct btree_trans *,
-				   struct bch_inode_info *,
-				   struct bch_inode_unpacked *,
-				   unsigned);
-int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
-				  inode_set_fn, void *, unsigned);
-
-int bch2_setattr_nonsize(struct mnt_idmap *,
-			 struct bch_inode_info *,
-			 struct iattr *);
-int __bch2_unlink(struct inode *, struct dentry *, bool);
-
-void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *);
-
-void bch2_vfs_exit(void);
-int bch2_vfs_init(void);
-
-#else
-
-#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields)	({ do {} while (0); })
-
-static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
-					       snapshot_id_list *s) {}
-static inline void bch2_vfs_exit(void) {}
-static inline int bch2_vfs_init(void) { return 0; }
-
-#endif /* NO_BCACHEFS_FS */
-
-#endif /* _BCACHEFS_FS_H */
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
deleted file mode 100644
index c8f57465131c..000000000000
--- a/fs/bcachefs/fsck.c
+++ /dev/null
@@ -1,2819 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "darray.h"
-#include "dirent.h"
-#include "error.h"
-#include "fs-common.h"
-#include "fsck.h"
-#include "inode.h"
-#include "keylist.h"
-#include "recovery_passes.h"
-#include "snapshot.h"
-#include "super.h"
-#include "xattr.h"
-
-#include <linux/bsearch.h>
-#include <linux/dcache.h> /* struct qstr */
-
-/*
- * XXX: this is handling transaction restarts without returning
- * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
- */
-static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
-				    u32 snapshot)
-{
-	u64 sectors = 0;
-
-	int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
-				SPOS(inum, 0, snapshot),
-				POS(inum, U64_MAX),
-				0, k, ({
-		if (bkey_extent_is_allocation(k.k))
-			sectors += k.k->size;
-		0;
-	}));
-
-	return ret ?: sectors;
-}
-
-static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
-				    u32 snapshot)
-{
-	u64 subdirs = 0;
-
-	int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
-				    SPOS(inum, 0, snapshot),
-				    POS(inum, U64_MAX),
-				    0, k, ({
-		if (k.k->type == KEY_TYPE_dirent &&
-		    bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
-			subdirs++;
-		0;
-	}));
-
-	return ret ?: subdirs;
-}
-
-static int subvol_lookup(struct btree_trans *trans, u32 subvol,
-			 u32 *snapshot, u64 *inum)
-{
-	struct bch_subvolume s;
-	int ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
-
-	*snapshot = le32_to_cpu(s.snapshot);
-	*inum = le64_to_cpu(s.inode);
-	return ret;
-}
-
-static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
-			      struct bch_inode_unpacked *inode)
-{
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	int ret;
-
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
-			     POS(0, inode_nr),
-			     BTREE_ITER_all_snapshots);
-	k = bch2_btree_iter_peek(&iter);
-	ret = bkey_err(k);
-	if (ret)
-		goto err;
-
-	if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) {
-		ret = -BCH_ERR_ENOENT_inode;
-		goto err;
-	}
-
-	ret = bch2_inode_unpack(k, inode);
-err:
-	bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
-			struct bch_inode_unpacked *inode,
-			u32 *snapshot)
-{
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	int ret;
-
-	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
-			       SPOS(0, inode_nr, *snapshot), 0);
-	ret = bkey_err(k);
-	if (ret)
-		goto err;
-
-	ret = bkey_is_inode(k.k)
-		? bch2_inode_unpack(k, inode)
-		: -BCH_ERR_ENOENT_inode;
-	if (!ret)
-		*snapshot = iter.pos.snapshot;
-err:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-static int lookup_dirent_in_snapshot(struct btree_trans *trans,
-			   struct bch_hash_info hash_info,
-			   subvol_inum dir, struct qstr *name,
-			   u64 *target, unsigned *type, u32 snapshot)
-{
-	struct btree_iter iter;
-	struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
-							 &hash_info, dir, name, 0, snapshot);
-	int ret = bkey_err(k);
-	if (ret)
-		return ret;
-
-	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
-	*target = le64_to_cpu(d.v->d_inum);
-	*type = d.v->d_type;
-	bch2_trans_iter_exit(trans, &iter);
-	return 0;
-}
-
-static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter iter;
-	struct bch_inode_unpacked dir_inode;
-	struct bch_hash_info dir_hash_info;
-	int ret;
-
-	ret = lookup_first_inode(trans, pos.inode, &dir_inode);
-	if (ret)
-		goto err;
-
-	dir_hash_info = bch2_hash_info_init(c, &dir_inode);
-
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
-
-	ret =   bch2_btree_iter_traverse(&iter) ?:
-		bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
-				    &dir_hash_info, &iter,
-				    BTREE_UPDATE_internal_snapshot_node);
-	bch2_trans_iter_exit(trans, &iter);
-err:
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-/* Get lost+found, create if it doesn't exist: */
-static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
-			    struct bch_inode_unpacked *lostfound,
-			    u64 reattaching_inum)
-{
-	struct bch_fs *c = trans->c;
-	struct qstr lostfound_str = QSTR("lost+found");
-	u64 inum = 0;
-	unsigned d_type = 0;
-	int ret;
-
-	struct bch_snapshot_tree st;
-	ret = bch2_snapshot_tree_lookup(trans,
-			bch2_snapshot_tree(c, snapshot), &st);
-	if (ret)
-		return ret;
-
-	subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
-
-	struct bch_subvolume subvol;
-	ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol),
-				 false, 0, &subvol);
-	bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u",
-		    le32_to_cpu(st.master_subvol), snapshot);
-	if (ret)
-		return ret;
-
-	if (!subvol.inode) {
-		struct btree_iter iter;
-		struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
-				BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)),
-				0, subvolume);
-		ret = PTR_ERR_OR_ZERO(subvol);
-		if (ret)
-			return ret;
-
-		subvol->v.inode = cpu_to_le64(reattaching_inum);
-		bch2_trans_iter_exit(trans, &iter);
-	}
-
-	root_inum.inum = le64_to_cpu(subvol.inode);
-
-	struct bch_inode_unpacked root_inode;
-	struct bch_hash_info root_hash_info;
-	u32 root_inode_snapshot = snapshot;
-	ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
-	bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
-		    root_inum.inum, le32_to_cpu(st.master_subvol));
-	if (ret)
-		return ret;
-
-	root_hash_info = bch2_hash_info_init(c, &root_inode);
-
-	ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
-			      &lostfound_str, &inum, &d_type, snapshot);
-	if (bch2_err_matches(ret, ENOENT))
-		goto create_lostfound;
-
-	bch_err_fn(c, ret);
-	if (ret)
-		return ret;
-
-	if (d_type != DT_DIR) {
-		bch_err(c, "error looking up lost+found: not a directory");
-		return -BCH_ERR_ENOENT_not_directory;
-	}
-
-	/*
-	 * The bch2_check_dirents pass has already run, dangling dirents
-	 * shouldn't exist here:
-	 */
-	ret = lookup_inode(trans, inum, lostfound, &snapshot);
-	bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
-		    inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
-	return ret;
-
-create_lostfound:
-	/*
-	 * XXX: we could have a nicer log message here  if we had a nice way to
-	 * walk backpointers to print a path
-	 */
-	bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot));
-
-	u64 now = bch2_current_time(c);
-	struct btree_iter lostfound_iter = { NULL };
-	u64 cpu = raw_smp_processor_id();
-
-	bch2_inode_init_early(c, lostfound);
-	bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
-	lostfound->bi_dir = root_inode.bi_inum;
-
-	root_inode.bi_nlink++;
-
-	ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
-	if (ret)
-		goto err;
-
-	bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
-	ret = bch2_btree_iter_traverse(&lostfound_iter);
-	if (ret)
-		goto err;
-
-	ret =   bch2_dirent_create_snapshot(trans,
-				0, root_inode.bi_inum, snapshot, &root_hash_info,
-				mode_to_type(lostfound->bi_mode),
-				&lostfound_str,
-				lostfound->bi_inum,
-				&lostfound->bi_dir_offset,
-				STR_HASH_must_create) ?:
-		bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
-				       BTREE_UPDATE_internal_snapshot_node);
-err:
-	bch_err_msg(c, ret, "creating lost+found");
-	bch2_trans_iter_exit(trans, &lostfound_iter);
-	return ret;
-}
-
-static int reattach_inode(struct btree_trans *trans,
-			  struct bch_inode_unpacked *inode,
-			  u32 inode_snapshot)
-{
-	struct bch_hash_info dir_hash;
-	struct bch_inode_unpacked lostfound;
-	char name_buf[20];
-	struct qstr name;
-	u64 dir_offset = 0;
-	u32 dirent_snapshot = inode_snapshot;
-	int ret;
-
-	if (inode->bi_subvol) {
-		inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
-
-		u64 root_inum;
-		ret = subvol_lookup(trans, inode->bi_parent_subvol,
-				    &dirent_snapshot, &root_inum);
-		if (ret)
-			return ret;
-
-		snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
-	} else {
-		snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
-	}
-
-	ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
-	if (ret)
-		return ret;
-
-	if (S_ISDIR(inode->bi_mode)) {
-		lostfound.bi_nlink++;
-
-		ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX);
-		if (ret)
-			return ret;
-	}
-
-	dir_hash = bch2_hash_info_init(trans->c, &lostfound);
-
-	name = (struct qstr) QSTR(name_buf);
-
-	ret = bch2_dirent_create_snapshot(trans,
-				inode->bi_parent_subvol, lostfound.bi_inum,
-				dirent_snapshot,
-				&dir_hash,
-				inode_d_type(inode),
-				&name,
-				inode->bi_subvol ?: inode->bi_inum,
-				&dir_offset,
-				STR_HASH_must_create);
-	if (ret)
-		return ret;
-
-	inode->bi_dir		= lostfound.bi_inum;
-	inode->bi_dir_offset	= dir_offset;
-
-	return __bch2_fsck_write_inode(trans, inode, inode_snapshot);
-}
-
-static int remove_backpointer(struct btree_trans *trans,
-			      struct bch_inode_unpacked *inode)
-{
-	struct btree_iter iter;
-	struct bkey_s_c_dirent d;
-	int ret;
-
-	d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
-				     POS(inode->bi_dir, inode->bi_dir_offset), 0,
-				     dirent);
-	ret =   bkey_err(d) ?:
-		__remove_dirent(trans, d.k->p);
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
-{
-	struct bch_fs *c = trans->c;
-
-	struct bch_inode_unpacked inode;
-	int ret = bch2_inode_find_by_inum_trans(trans,
-				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
-				&inode);
-	if (ret)
-		return ret;
-
-	ret = remove_backpointer(trans, &inode);
-	bch_err_msg(c, ret, "removing dirent");
-	if (ret)
-		return ret;
-
-	ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot));
-	bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
-	return ret;
-}
-
-static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
-{
-	struct bch_fs *c = trans->c;
-
-	if (!bch2_snapshot_is_leaf(c, snapshotid)) {
-		bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
-		return -BCH_ERR_fsck_repair_unimplemented;
-	}
-
-	/*
-	 * If inum isn't set, that means we're being called from check_dirents,
-	 * not check_inodes - the root of this subvolume doesn't exist or we
-	 * would have found it there:
-	 */
-	if (!inum) {
-		struct btree_iter inode_iter = {};
-		struct bch_inode_unpacked new_inode;
-		u64 cpu = raw_smp_processor_id();
-
-		bch2_inode_init_early(c, &new_inode);
-		bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
-
-		new_inode.bi_subvol = subvolid;
-
-		int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
-			  bch2_btree_iter_traverse(&inode_iter) ?:
-			  bch2_inode_write(trans, &inode_iter, &new_inode);
-		bch2_trans_iter_exit(trans, &inode_iter);
-		if (ret)
-			return ret;
-
-		inum = new_inode.bi_inum;
-	}
-
-	bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);
-
-	struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
-	int ret = PTR_ERR_OR_ZERO(new_subvol);
-	if (ret)
-		return ret;
-
-	bkey_subvolume_init(&new_subvol->k_i);
-	new_subvol->k.p.offset	= subvolid;
-	new_subvol->v.snapshot	= cpu_to_le32(snapshotid);
-	new_subvol->v.inode	= cpu_to_le64(inum);
-	ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
-	if (ret)
-		return ret;
-
-	struct btree_iter iter;
-	struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
-			BTREE_ID_snapshots, POS(0, snapshotid),
-			0, snapshot);
-	ret = PTR_ERR_OR_ZERO(s);
-	bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
-	if (ret)
-		return ret;
-
-	u32 snapshot_tree = le32_to_cpu(s->v.tree);
-
-	s->v.subvol = cpu_to_le32(subvolid);
-	SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
-	bch2_trans_iter_exit(trans, &iter);
-
-	struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
-			BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
-			0, snapshot_tree);
-	ret = PTR_ERR_OR_ZERO(st);
-	bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
-	if (ret)
-		return ret;
-
-	if (!st->v.master_subvol)
-		st->v.master_subvol = cpu_to_le32(subvolid);
-
-	bch2_trans_iter_exit(trans, &iter);
-	return 0;
-}
-
-static int reconstruct_inode(struct btree_trans *trans, u32 snapshot, u64 inum, u64 size, unsigned mode)
-{
-	struct bch_fs *c = trans->c;
-	struct bch_inode_unpacked new_inode;
-
-	bch2_inode_init_early(c, &new_inode);
-	bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, mode|0755, 0, NULL);
-	new_inode.bi_size = size;
-	new_inode.bi_inum = inum;
-
-	return __bch2_fsck_write_inode(trans, &new_inode, snapshot);
-}
-
-static int reconstruct_reg_inode(struct btree_trans *trans, u32 snapshot, u64 inum)
-{
-	struct btree_iter iter = {};
-
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
-	struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter);
-	bch2_trans_iter_exit(trans, &iter);
-	int ret = bkey_err(k);
-	if (ret)
-		return ret;
-
-	return reconstruct_inode(trans, snapshot, inum, k.k->p.offset << 9, S_IFREG);
-}
-
-struct snapshots_seen {
-	struct bpos			pos;
-	snapshot_id_list		ids;
-};
-
-static inline void snapshots_seen_exit(struct snapshots_seen *s)
-{
-	darray_exit(&s->ids);
-}
-
-static inline void snapshots_seen_init(struct snapshots_seen *s)
-{
-	memset(s, 0, sizeof(*s));
-}
-
-static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
-{
-	u32 *i;
-	__darray_for_each(s->ids, i) {
-		if (*i == id)
-			return 0;
-		if (*i > id)
-			break;
-	}
-
-	int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
-	if (ret)
-		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
-			s->ids.size);
-	return ret;
-}
-
-static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
-				 enum btree_id btree_id, struct bpos pos)
-{
-	if (!bkey_eq(s->pos, pos))
-		s->ids.nr = 0;
-	s->pos = pos;
-
-	return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
-}
-
-/**
- * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
- * and @ancestor hasn't been overwritten in @seen
- *
- * @c:		filesystem handle
- * @seen:	list of snapshot ids already seen at current position
- * @id:		descendent snapshot id
- * @ancestor:	ancestor snapshot id
- *
- * Returns:	whether key in @ancestor snapshot is visible in @id snapshot
- */
-static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
-				    u32 id, u32 ancestor)
-{
-	ssize_t i;
-
-	EBUG_ON(id > ancestor);
-
-	/* @ancestor should be the snapshot most recently added to @seen */
-	EBUG_ON(ancestor != seen->pos.snapshot);
-	EBUG_ON(ancestor != darray_last(seen->ids));
-
-	if (id == ancestor)
-		return true;
-
-	if (!bch2_snapshot_is_ancestor(c, id, ancestor))
-		return false;
-
-	/*
-	 * We know that @id is a descendant of @ancestor, we're checking if
-	 * we've seen a key that overwrote @ancestor - i.e. also a descendent of
-	 * @ascestor and with @id as a descendent.
-	 *
-	 * But we already know that we're scanning IDs between @id and @ancestor
-	 * numerically, since snapshot ID lists are kept sorted, so if we find
-	 * an id that's an ancestor of @id we're done:
-	 */
-
-	for (i = seen->ids.nr - 2;
-	     i >= 0 && seen->ids.data[i] >= id;
-	     --i)
-		if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
-			return false;
-
-	return true;
-}
-
-/**
- * ref_visible - given a key with snapshot id @src that points to a key with
- * snapshot id @dst, test whether there is some snapshot in which @dst is
- * visible.
- *
- * @c:		filesystem handle
- * @s:		list of snapshot IDs already seen at @src
- * @src:	snapshot ID of src key
- * @dst:	snapshot ID of dst key
- * Returns:	true if there is some snapshot in which @dst is visible
- *
- * Assumes we're visiting @src keys in natural key order
- */
-static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
-			u32 src, u32 dst)
-{
-	return dst <= src
-		? key_visible_in_snapshot(c, s, dst, src)
-		: bch2_snapshot_is_ancestor(c, src, dst);
-}
-
-static int ref_visible2(struct bch_fs *c,
-			u32 src, struct snapshots_seen *src_seen,
-			u32 dst, struct snapshots_seen *dst_seen)
-{
-	if (dst > src) {
-		swap(dst, src);
-		swap(dst_seen, src_seen);
-	}
-	return key_visible_in_snapshot(c, src_seen, dst, src);
-}
-
-#define for_each_visible_inode(_c, _s, _w, _snapshot, _i)				\
-	for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&	\
-	     (_i)->snapshot <= (_snapshot); _i++)					\
-		if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
-
-struct inode_walker_entry {
-	struct bch_inode_unpacked inode;
-	u32			snapshot;
-	bool			seen_this_pos;
-	u64			count;
-};
-
-struct inode_walker {
-	bool				first_this_inode;
-	bool				recalculate_sums;
-	struct bpos			last_pos;
-
-	DARRAY(struct inode_walker_entry) inodes;
-};
-
-static void inode_walker_exit(struct inode_walker *w)
-{
-	darray_exit(&w->inodes);
-}
-
-static struct inode_walker inode_walker_init(void)
-{
-	return (struct inode_walker) { 0, };
-}
-
-static int add_inode(struct bch_fs *c, struct inode_walker *w,
-		     struct bkey_s_c inode)
-{
-	struct bch_inode_unpacked u;
-
-	BUG_ON(bch2_inode_unpack(inode, &u));
-
-	return darray_push(&w->inodes, ((struct inode_walker_entry) {
-		.inode		= u,
-		.snapshot	= inode.k->p.snapshot,
-	}));
-}
-
-static int get_inodes_all_snapshots(struct btree_trans *trans,
-				    struct inode_walker *w, u64 inum)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	int ret;
-
-	w->recalculate_sums = false;
-	w->inodes.nr = 0;
-
-	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
-				     BTREE_ITER_all_snapshots, k, ret) {
-		if (k.k->p.offset != inum)
-			break;
-
-		if (bkey_is_inode(k.k))
-			add_inode(c, w, k);
-	}
-	bch2_trans_iter_exit(trans, &iter);
-
-	if (ret)
-		return ret;
-
-	w->first_this_inode = true;
-	return 0;
-}
-
-static struct inode_walker_entry *
-lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
-{
-	bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
-
-	struct inode_walker_entry *i;
-	__darray_for_each(w->inodes, i)
-		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
-			goto found;
-
-	return NULL;
-found:
-	BUG_ON(k.k->p.snapshot > i->snapshot);
-
-	if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
-		struct inode_walker_entry new = *i;
-
-		new.snapshot = k.k->p.snapshot;
-		new.count = 0;
-
-		struct printbuf buf = PRINTBUF;
-		bch2_bkey_val_to_text(&buf, c, k);
-
-		bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
-			 "unexpected because we should always update the inode when we update a key in that inode\n"
-			 "%s",
-			 w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
-		printbuf_exit(&buf);
-
-		while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
-			--i;
-
-		size_t pos = i - w->inodes.data;
-		int ret = darray_insert_item(&w->inodes, pos, new);
-		if (ret)
-			return ERR_PTR(ret);
-
-		i = w->inodes.data + pos;
-	}
-
-	return i;
-}
-
-static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
-					     struct inode_walker *w,
-					     struct bkey_s_c k)
-{
-	if (w->last_pos.inode != k.k->p.inode) {
-		int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
-		if (ret)
-			return ERR_PTR(ret);
-	} else if (bkey_cmp(w->last_pos, k.k->p)) {
-		darray_for_each(w->inodes, i)
-			i->seen_this_pos = false;
-	}
-
-	w->last_pos = k.k->p;
-
-	return lookup_inode_for_snapshot(trans->c, w, k);
-}
-
-static int get_visible_inodes(struct btree_trans *trans,
-			      struct inode_walker *w,
-			      struct snapshots_seen *s,
-			      u64 inum)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	int ret;
-
-	w->inodes.nr = 0;
-
-	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
-			   BTREE_ITER_all_snapshots, k, ret) {
-		if (k.k->p.offset != inum)
-			break;
-
-		if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
-			continue;
-
-		if (bkey_is_inode(k.k))
-			add_inode(c, w, k);
-
-		if (k.k->p.snapshot >= s->pos.snapshot)
-			break;
-	}
-	bch2_trans_iter_exit(trans, &iter);
-
-	return ret;
-}
-
-static int check_key_has_snapshot(struct btree_trans *trans,
-				  struct btree_iter *iter,
-				  struct bkey_s_c k)
-{
-	struct bch_fs *c = trans->c;
-	struct printbuf buf = PRINTBUF;
-	int ret = 0;
-
-	if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
-				bkey_in_missing_snapshot,
-				"key in missing snapshot: %s",
-				(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-		ret = bch2_btree_delete_at(trans, iter,
-					    BTREE_UPDATE_internal_snapshot_node) ?: 1;
-fsck_err:
-	printbuf_exit(&buf);
-	return ret;
-}
-
-static int hash_redo_key(struct btree_trans *trans,
-			 const struct bch_hash_desc desc,
-			 struct bch_hash_info *hash_info,
-			 struct btree_iter *k_iter, struct bkey_s_c k)
-{
-	struct bkey_i *delete;
-	struct bkey_i *tmp;
-
-	delete = bch2_trans_kmalloc(trans, sizeof(*delete));
-	if (IS_ERR(delete))
-		return PTR_ERR(delete);
-
-	tmp = bch2_bkey_make_mut_noupdate(trans, k);
-	if (IS_ERR(tmp))
-		return PTR_ERR(tmp);
-
-	bkey_init(&delete->k);
-	delete->k.p = k_iter->pos;
-	return  bch2_btree_iter_traverse(k_iter) ?:
-		bch2_trans_update(trans, k_iter, delete, 0) ?:
-		bch2_hash_set_in_snapshot(trans, desc, hash_info,
-				       (subvol_inum) { 0, k.k->p.inode },
-				       k.k->p.snapshot, tmp,
-				       STR_HASH_must_create|
-				       BTREE_UPDATE_internal_snapshot_node) ?:
-		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-}
-
-static int hash_check_key(struct btree_trans *trans,
-			  const struct bch_hash_desc desc,
-			  struct bch_hash_info *hash_info,
-			  struct btree_iter *k_iter, struct bkey_s_c hash_k)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter iter = { NULL };
-	struct printbuf buf = PRINTBUF;
-	struct bkey_s_c k;
-	u64 hash;
-	int ret = 0;
-
-	if (hash_k.k->type != desc.key_type)
-		return 0;
-
-	hash = desc.hash_bkey(hash_info, hash_k);
-
-	if (likely(hash == hash_k.k->p.offset))
-		return 0;
-
-	if (hash_k.k->p.offset < hash)
-		goto bad_hash;
-
-	for_each_btree_key_norestart(trans, iter, desc.btree_id,
-				     SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
-				     BTREE_ITER_slots, k, ret) {
-		if (bkey_eq(k.k->p, hash_k.k->p))
-			break;
-
-		if (fsck_err_on(k.k->type == desc.key_type &&
-				!desc.cmp_bkey(k, hash_k), c,
-				hash_table_key_duplicate,
-				"duplicate hash table keys:\n%s",
-				(printbuf_reset(&buf),
-				 bch2_bkey_val_to_text(&buf, c, hash_k),
-				 buf.buf))) {
-			ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
-			break;
-		}
-
-		if (bkey_deleted(k.k)) {
-			bch2_trans_iter_exit(trans, &iter);
-			goto bad_hash;
-		}
-	}
-out:
-	bch2_trans_iter_exit(trans, &iter);
-	printbuf_exit(&buf);
-	return ret;
-bad_hash:
-	if (fsck_err(c, hash_table_key_wrong_offset,
-		     "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
-		     bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
-		     (printbuf_reset(&buf),
-		      bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
-		ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
-		bch_err_fn(c, ret);
-		if (ret)
-			return ret;
-		ret = -BCH_ERR_transaction_restart_nested;
-	}
-fsck_err:
-	goto out;
-}
-
-static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
-						struct btree_iter *iter,
-						struct bpos pos)
-{
-	return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
-}
-
-static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
-					       struct btree_iter *iter,
-					       struct bch_inode_unpacked *inode,
-					       u32 *snapshot)
-{
-	if (inode->bi_subvol) {
-		u64 inum;
-		int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
-		if (ret)
-			return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
-	}
-
-	return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
-}
-
-static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
-				   struct bkey_s_c_dirent d)
-{
-	return  inode->bi_dir		== d.k->p.inode &&
-		inode->bi_dir_offset	== d.k->p.offset;
-}
-
-static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
-				   struct bch_inode_unpacked *inode)
-{
-	return d.v->d_type == DT_SUBVOL
-		? le32_to_cpu(d.v->d_child_subvol)	== inode->bi_subvol
-		: le64_to_cpu(d.v->d_inum)		== inode->bi_inum;
-}
-
-static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
-{
-	struct btree_iter iter;
-	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
-	int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k,
-				    struct bch_inode_unpacked *inode,
-				    u32 inode_snapshot, bool *write_inode)
-{
-	struct bch_fs *c = trans->c;
-	struct printbuf buf = PRINTBUF;
-
-	struct btree_iter dirent_iter = {};
-	struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
-	int ret = bkey_err(d);
-	if (ret && !bch2_err_matches(ret, ENOENT))
-		return ret;
-
-	if (fsck_err_on(ret,
-			c, inode_points_to_missing_dirent,
-			"inode points to missing dirent\n%s",
-			(bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) ||
-	    fsck_err_on(!ret && !dirent_points_to_inode(d, inode),
-			c, inode_points_to_wrong_dirent,
-			"inode points to dirent that does not point back:\n%s",
-			(bch2_bkey_val_to_text(&buf, c, inode_k),
-			 prt_newline(&buf),
-			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
-		/*
-		 * We just clear the backpointer fields for now. If we find a
-		 * dirent that points to this inode in check_dirents(), we'll
-		 * update it then; then when we get to check_path() if the
-		 * backpointer is still 0 we'll reattach it.
-		 */
-		inode->bi_dir = 0;
-		inode->bi_dir_offset = 0;
-		inode->bi_flags &= ~BCH_INODE_backptr_untrusted;
-		*write_inode = true;
-	}
-
-	ret = 0;
-fsck_err:
-	bch2_trans_iter_exit(trans, &dirent_iter);
-	printbuf_exit(&buf);
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-static int check_inode(struct btree_trans *trans,
-		       struct btree_iter *iter,
-		       struct bkey_s_c k,
-		       struct bch_inode_unpacked *prev,
-		       struct snapshots_seen *s,
-		       bool full)
-{
-	struct bch_fs *c = trans->c;
-	struct bch_inode_unpacked u;
-	bool do_update = false;
-	int ret;
-
-	ret = check_key_has_snapshot(trans, iter, k);
-	if (ret < 0)
-		goto err;
-	if (ret)
-		return 0;
-
-	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
-	if (ret)
-		goto err;
-
-	if (!bkey_is_inode(k.k))
-		return 0;
-
-	BUG_ON(bch2_inode_unpack(k, &u));
-
-	if (!full &&
-	    !(u.bi_flags & (BCH_INODE_i_size_dirty|
-			    BCH_INODE_i_sectors_dirty|
-			    BCH_INODE_unlinked)))
-		return 0;
-
-	if (prev->bi_inum != u.bi_inum)
-		*prev = u;
-
-	if (fsck_err_on(prev->bi_hash_seed	!= u.bi_hash_seed ||
-			inode_d_type(prev)	!= inode_d_type(&u),
-			c, inode_snapshot_mismatch,
-			"inodes in different snapshots don't match")) {
-		bch_err(c, "repair not implemented yet");
-		return -BCH_ERR_fsck_repair_unimplemented;
-	}
-
-	if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
-	    bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
-		struct bpos new_min_pos;
-
-		ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
-		if (ret)
-			goto err;
-
-		u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
-
-		ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
-
-		bch_err_msg(c, ret, "in fsck updating inode");
-		if (ret)
-			return ret;
-
-		if (!bpos_eq(new_min_pos, POS_MIN))
-			bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
-		return 0;
-	}
-
-	if (u.bi_flags & BCH_INODE_unlinked) {
-		ret = check_inode_deleted_list(trans, k.k->p);
-		if (ret < 0)
-			return ret;
-
-		fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list,
-			    "inode %llu:%u unlinked, but not on deleted list",
-			    u.bi_inum, k.k->p.snapshot);
-		ret = 0;
-	}
-
-	if (u.bi_flags & BCH_INODE_unlinked &&
-	    (!c->sb.clean ||
-	     fsck_err(c, inode_unlinked_but_clean,
-		      "filesystem marked clean, but inode %llu unlinked",
-		      u.bi_inum))) {
-		ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
-		bch_err_msg(c, ret, "in fsck deleting inode");
-		return ret;
-	}
-
-	if (u.bi_flags & BCH_INODE_i_size_dirty &&
-	    (!c->sb.clean ||
-	     fsck_err(c, inode_i_size_dirty_but_clean,
-		      "filesystem marked clean, but inode %llu has i_size dirty",
-		      u.bi_inum))) {
-		bch_verbose(c, "truncating inode %llu", u.bi_inum);
-
-		/*
-		 * XXX: need to truncate partial blocks too here - or ideally
-		 * just switch units to bytes and that issue goes away
-		 */
-		ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
-				SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
-				     iter->pos.snapshot),
-				POS(u.bi_inum, U64_MAX),
-				0, NULL);
-		bch_err_msg(c, ret, "in fsck truncating inode");
-		if (ret)
-			return ret;
-
-		/*
-		 * We truncated without our normal sector accounting hook, just
-		 * make sure we recalculate it:
-		 */
-		u.bi_flags |= BCH_INODE_i_sectors_dirty;
-
-		u.bi_flags &= ~BCH_INODE_i_size_dirty;
-		do_update = true;
-	}
-
-	if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
-	    (!c->sb.clean ||
-	     fsck_err(c, inode_i_sectors_dirty_but_clean,
-		      "filesystem marked clean, but inode %llu has i_sectors dirty",
-		      u.bi_inum))) {
-		s64 sectors;
-
-		bch_verbose(c, "recounting sectors for inode %llu",
-			    u.bi_inum);
-
-		sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
-		if (sectors < 0) {
-			bch_err_msg(c, sectors, "in fsck recounting inode sectors");
-			return sectors;
-		}
-
-		u.bi_sectors = sectors;
-		u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
-		do_update = true;
-	}
-
-	if (u.bi_flags & BCH_INODE_backptr_untrusted) {
-		u.bi_dir = 0;
-		u.bi_dir_offset = 0;
-		u.bi_flags &= ~BCH_INODE_backptr_untrusted;
-		do_update = true;
-	}
-
-	if (u.bi_dir || u.bi_dir_offset) {
-		ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update);
-		if (ret)
-			goto err;
-	}
-
-	if (fsck_err_on(u.bi_parent_subvol &&
-			(u.bi_subvol == 0 ||
-			 u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
-			c, inode_bi_parent_nonzero,
-			"inode %llu:%u has subvol %u but nonzero parent subvol %u",
-			u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
-		u.bi_parent_subvol = 0;
-		do_update = true;
-	}
-
-	if (u.bi_subvol) {
-		struct bch_subvolume s;
-
-		ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
-		if (ret && !bch2_err_matches(ret, ENOENT))
-			goto err;
-
-		if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
-			ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
-			goto do_update;
-		}
-
-		if (fsck_err_on(ret,
-				c, inode_bi_subvol_missing,
-				"inode %llu:%u bi_subvol points to missing subvolume %u",
-				u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
-		    fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
-				!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
-							   k.k->p.snapshot),
-				c, inode_bi_subvol_wrong,
-				"inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
-				u.bi_inum, k.k->p.snapshot, u.bi_subvol,
-				le64_to_cpu(s.inode),
-				le32_to_cpu(s.snapshot))) {
-			u.bi_subvol = 0;
-			u.bi_parent_subvol = 0;
-			do_update = true;
-		}
-	}
-do_update:
-	if (do_update) {
-		ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
-		bch_err_msg(c, ret, "in fsck updating inode");
-		if (ret)
-			return ret;
-	}
-err:
-fsck_err:
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-int bch2_check_inodes(struct bch_fs *c)
-{
-	bool full = c->opts.fsck;
-	struct bch_inode_unpacked prev = { 0 };
-	struct snapshots_seen s;
-
-	snapshots_seen_init(&s);
-
-	int ret = bch2_trans_run(c,
-		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
-				POS_MIN,
-				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
-				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-			check_inode(trans, &iter, k, &prev, &s, full)));
-
-	snapshots_seen_exit(&s);
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
-{
-	struct bch_fs *c = trans->c;
-	int ret = 0;
-	s64 count2;
-
-	darray_for_each(w->inodes, i) {
-		if (i->inode.bi_sectors == i->count)
-			continue;
-
-		count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
-
-		if (w->recalculate_sums)
-			i->count = count2;
-
-		if (i->count != count2) {
-			bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
-					    w->last_pos.inode, i->snapshot, i->count, count2);
-			return -BCH_ERR_internal_fsck_err;
-		}
-
-		if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
-				c, inode_i_sectors_wrong,
-				"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
-				w->last_pos.inode, i->snapshot,
-				i->inode.bi_sectors, i->count)) {
-			i->inode.bi_sectors = i->count;
-			ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
-			if (ret)
-				break;
-		}
-	}
-fsck_err:
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
-{
-	u32 restart_count = trans->restart_count;
-	return check_i_sectors_notnested(trans, w) ?:
-		trans_was_restarted(trans, restart_count);
-}
-
-struct extent_end {
-	u32			snapshot;
-	u64			offset;
-	struct snapshots_seen	seen;
-};
-
-struct extent_ends {
-	struct bpos			last_pos;
-	DARRAY(struct extent_end)	e;
-};
-
-static void extent_ends_reset(struct extent_ends *extent_ends)
-{
-	darray_for_each(extent_ends->e, i)
-		snapshots_seen_exit(&i->seen);
-	extent_ends->e.nr = 0;
-}
-
-static void extent_ends_exit(struct extent_ends *extent_ends)
-{
-	extent_ends_reset(extent_ends);
-	darray_exit(&extent_ends->e);
-}
-
-static void extent_ends_init(struct extent_ends *extent_ends)
-{
-	memset(extent_ends, 0, sizeof(*extent_ends));
-}
-
-static int extent_ends_at(struct bch_fs *c,
-			  struct extent_ends *extent_ends,
-			  struct snapshots_seen *seen,
-			  struct bkey_s_c k)
-{
-	struct extent_end *i, n = (struct extent_end) {
-		.offset		= k.k->p.offset,
-		.snapshot	= k.k->p.snapshot,
-		.seen		= *seen,
-	};
-
-	n.seen.ids.data = kmemdup(seen->ids.data,
-			      sizeof(seen->ids.data[0]) * seen->ids.size,
-			      GFP_KERNEL);
-	if (!n.seen.ids.data)
-		return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
-
-	__darray_for_each(extent_ends->e, i) {
-		if (i->snapshot == k.k->p.snapshot) {
-			snapshots_seen_exit(&i->seen);
-			*i = n;
-			return 0;
-		}
-
-		if (i->snapshot >= k.k->p.snapshot)
-			break;
-	}
-
-	return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
-}
-
-static int overlapping_extents_found(struct btree_trans *trans,
-				     enum btree_id btree,
-				     struct bpos pos1, struct snapshots_seen *pos1_seen,
-				     struct bkey pos2,
-				     bool *fixed,
-				     struct extent_end *extent_end)
-{
-	struct bch_fs *c = trans->c;
-	struct printbuf buf = PRINTBUF;
-	struct btree_iter iter1, iter2 = { NULL };
-	struct bkey_s_c k1, k2;
-	int ret;
-
-	BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
-
-	bch2_trans_iter_init(trans, &iter1, btree, pos1,
-			     BTREE_ITER_all_snapshots|
-			     BTREE_ITER_not_extents);
-	k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
-	ret = bkey_err(k1);
-	if (ret)
-		goto err;
-
-	prt_str(&buf, "\n  ");
-	bch2_bkey_val_to_text(&buf, c, k1);
-
-	if (!bpos_eq(pos1, k1.k->p)) {
-		prt_str(&buf, "\n  wanted\n  ");
-		bch2_bpos_to_text(&buf, pos1);
-		prt_str(&buf, "\n  ");
-		bch2_bkey_to_text(&buf, &pos2);
-
-		bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
-			__func__, buf.buf);
-		ret = -BCH_ERR_internal_fsck_err;
-		goto err;
-	}
-
-	bch2_trans_copy_iter(&iter2, &iter1);
-
-	while (1) {
-		bch2_btree_iter_advance(&iter2);
-
-		k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX));
-		ret = bkey_err(k2);
-		if (ret)
-			goto err;
-
-		if (bpos_ge(k2.k->p, pos2.p))
-			break;
-	}
-
-	prt_str(&buf, "\n  ");
-	bch2_bkey_val_to_text(&buf, c, k2);
-
-	if (bpos_gt(k2.k->p, pos2.p) ||
-	    pos2.size != k2.k->size) {
-		bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
-			__func__, buf.buf);
-		ret = -BCH_ERR_internal_fsck_err;
-		goto err;
-	}
-
-	prt_printf(&buf, "\n  overwriting %s extent",
-		   pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
-
-	if (fsck_err(c, extent_overlapping,
-		     "overlapping extents%s", buf.buf)) {
-		struct btree_iter *old_iter = &iter1;
-		struct disk_reservation res = { 0 };
-
-		if (pos1.snapshot < pos2.p.snapshot) {
-			old_iter = &iter2;
-			swap(k1, k2);
-		}
-
-		trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
-
-		ret =   bch2_trans_update_extent_overwrite(trans, old_iter,
-				BTREE_UPDATE_internal_snapshot_node,
-				k1, k2) ?:
-			bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
-		bch2_disk_reservation_put(c, &res);
-
-		if (ret)
-			goto err;
-
-		*fixed = true;
-
-		if (pos1.snapshot == pos2.p.snapshot) {
-			/*
-			 * We overwrote the first extent, and did the overwrite
-			 * in the same snapshot:
-			 */
-			extent_end->offset = bkey_start_offset(&pos2);
-		} else if (pos1.snapshot > pos2.p.snapshot) {
-			/*
-			 * We overwrote the first extent in pos2's snapshot:
-			 */
-			ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
-		} else {
-			/*
-			 * We overwrote the second extent - restart
-			 * check_extent() from the top:
-			 */
-			ret = -BCH_ERR_transaction_restart_nested;
-		}
-	}
-fsck_err:
-err:
-	bch2_trans_iter_exit(trans, &iter2);
-	bch2_trans_iter_exit(trans, &iter1);
-	printbuf_exit(&buf);
-	return ret;
-}
-
-static int check_overlapping_extents(struct btree_trans *trans,
-			      struct snapshots_seen *seen,
-			      struct extent_ends *extent_ends,
-			      struct bkey_s_c k,
-			      struct btree_iter *iter,
-			      bool *fixed)
-{
-	struct bch_fs *c = trans->c;
-	int ret = 0;
-
-	/* transaction restart, running again */
-	if (bpos_eq(extent_ends->last_pos, k.k->p))
-		return 0;
-
-	if (extent_ends->last_pos.inode != k.k->p.inode)
-		extent_ends_reset(extent_ends);
-
-	darray_for_each(extent_ends->e, i) {
-		if (i->offset <= bkey_start_offset(k.k))
-			continue;
-
-		if (!ref_visible2(c,
-				  k.k->p.snapshot, seen,
-				  i->snapshot, &i->seen))
-			continue;
-
-		ret = overlapping_extents_found(trans, iter->btree_id,
-						SPOS(iter->pos.inode,
-						     i->offset,
-						     i->snapshot),
-						&i->seen,
-						*k.k, fixed, i);
-		if (ret)
-			goto err;
-	}
-
-	extent_ends->last_pos = k.k->p;
-err:
-	return ret;
-}
-
-static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
-				struct bkey_s_c k)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	struct bch_extent_crc_unpacked crc;
-	const union bch_extent_entry *i;
-	unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
-
-	bkey_for_each_crc(k.k, ptrs, crc, i)
-		if (crc_is_encoded(crc) &&
-		    crc.uncompressed_size > encoded_extent_max_sectors) {
-			struct printbuf buf = PRINTBUF;
-
-			bch2_bkey_val_to_text(&buf, c, k);
-			bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
-			printbuf_exit(&buf);
-		}
-
-	return 0;
-}
-
-static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
-			struct bkey_s_c k,
-			struct inode_walker *inode,
-			struct snapshots_seen *s,
-			struct extent_ends *extent_ends)
-{
-	struct bch_fs *c = trans->c;
-	struct inode_walker_entry *i;
-	struct printbuf buf = PRINTBUF;
-	int ret = 0;
-
-	ret = check_key_has_snapshot(trans, iter, k);
-	if (ret) {
-		ret = ret < 0 ? ret : 0;
-		goto out;
-	}
-
-	if (inode->last_pos.inode != k.k->p.inode) {
-		ret = check_i_sectors(trans, inode);
-		if (ret)
-			goto err;
-	}
-
-	i = walk_inode(trans, inode, k);
-	ret = PTR_ERR_OR_ZERO(i);
-	if (ret)
-		goto err;
-
-	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
-	if (ret)
-		goto err;
-
-	if (k.k->type != KEY_TYPE_whiteout) {
-		if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
-			ret =   reconstruct_reg_inode(trans, k.k->p.snapshot, k.k->p.inode) ?:
-				bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-			if (ret)
-				goto err;
-
-			inode->last_pos.inode--;
-			ret = -BCH_ERR_transaction_restart_nested;
-			goto err;
-		}
-
-		if (fsck_err_on(!i, c, extent_in_missing_inode,
-				"extent in missing inode:\n  %s",
-				(printbuf_reset(&buf),
-				 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-			goto delete;
-
-		if (fsck_err_on(i &&
-				!S_ISREG(i->inode.bi_mode) &&
-				!S_ISLNK(i->inode.bi_mode),
-				c, extent_in_non_reg_inode,
-				"extent in non regular inode mode %o:\n  %s",
-				i->inode.bi_mode,
-				(printbuf_reset(&buf),
-				 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-			goto delete;
-
-		ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
-						&inode->recalculate_sums);
-		if (ret)
-			goto err;
-	}
-
-	/*
-	 * Check inodes in reverse order, from oldest snapshots to newest,
-	 * starting from the inode that matches this extent's snapshot. If we
-	 * didn't have one, iterate over all inodes:
-	 */
-	if (!i)
-		i = inode->inodes.data + inode->inodes.nr - 1;
-
-	for (;
-	     inode->inodes.data && i >= inode->inodes.data;
-	     --i) {
-		if (i->snapshot > k.k->p.snapshot ||
-		    !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
-			continue;
-
-		if (k.k->type != KEY_TYPE_whiteout) {
-			if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
-					k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
-					!bkey_extent_is_reservation(k),
-					c, extent_past_end_of_inode,
-					"extent type past end of inode %llu:%u, i_size %llu\n  %s",
-					i->inode.bi_inum, i->snapshot, i->inode.bi_size,
-					(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-				struct btree_iter iter2;
-
-				bch2_trans_copy_iter(&iter2, iter);
-				bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
-				ret =   bch2_btree_iter_traverse(&iter2) ?:
-					bch2_btree_delete_at(trans, &iter2,
-						BTREE_UPDATE_internal_snapshot_node);
-				bch2_trans_iter_exit(trans, &iter2);
-				if (ret)
-					goto err;
-
-				iter->k.type = KEY_TYPE_whiteout;
-			}
-
-			if (bkey_extent_is_allocation(k.k))
-				i->count += k.k->size;
-		}
-
-		i->seen_this_pos = true;
-	}
-
-	if (k.k->type != KEY_TYPE_whiteout) {
-		ret = extent_ends_at(c, extent_ends, s, k);
-		if (ret)
-			goto err;
-	}
-out:
-err:
-fsck_err:
-	printbuf_exit(&buf);
-	bch_err_fn(c, ret);
-	return ret;
-delete:
-	ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
-	goto out;
-}
-
-/*
- * Walk extents: verify that extents have a corresponding S_ISREG inode, and
- * that i_size an i_sectors are consistent
- */
-int bch2_check_extents(struct bch_fs *c)
-{
-	struct inode_walker w = inode_walker_init();
-	struct snapshots_seen s;
-	struct extent_ends extent_ends;
-	struct disk_reservation res = { 0 };
-
-	snapshots_seen_init(&s);
-	extent_ends_init(&extent_ends);
-
-	int ret = bch2_trans_run(c,
-		for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
-				POS(BCACHEFS_ROOT_INO, 0),
-				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
-				&res, NULL,
-				BCH_TRANS_COMMIT_no_enospc, ({
-			bch2_disk_reservation_put(c, &res);
-			check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
-			check_extent_overbig(trans, &iter, k);
-		})) ?:
-		check_i_sectors_notnested(trans, &w));
-
-	bch2_disk_reservation_put(c, &res);
-	extent_ends_exit(&extent_ends);
-	inode_walker_exit(&w);
-	snapshots_seen_exit(&s);
-
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-int bch2_check_indirect_extents(struct bch_fs *c)
-{
-	struct disk_reservation res = { 0 };
-
-	int ret = bch2_trans_run(c,
-		for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
-				POS_MIN,
-				BTREE_ITER_prefetch, k,
-				&res, NULL,
-				BCH_TRANS_COMMIT_no_enospc, ({
-			bch2_disk_reservation_put(c, &res);
-			check_extent_overbig(trans, &iter, k);
-		})));
-
-	bch2_disk_reservation_put(c, &res);
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
-{
-	struct bch_fs *c = trans->c;
-	int ret = 0;
-	s64 count2;
-
-	darray_for_each(w->inodes, i) {
-		if (i->inode.bi_nlink == i->count)
-			continue;
-
-		count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
-		if (count2 < 0)
-			return count2;
-
-		if (i->count != count2) {
-			bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
-					    w->last_pos.inode, i->snapshot, i->count, count2);
-			i->count = count2;
-			if (i->inode.bi_nlink == i->count)
-				continue;
-		}
-
-		if (fsck_err_on(i->inode.bi_nlink != i->count,
-				c, inode_dir_wrong_nlink,
-				"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
-				w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
-			i->inode.bi_nlink = i->count;
-			ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
-			if (ret)
-				break;
-		}
-	}
-fsck_err:
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
-{
-	u32 restart_count = trans->restart_count;
-	return check_subdir_count_notnested(trans, w) ?:
-		trans_was_restarted(trans, restart_count);
-}
-
-static int check_dirent_inode_dirent(struct btree_trans *trans,
-				   struct btree_iter *iter,
-				   struct bkey_s_c_dirent d,
-				   struct bch_inode_unpacked *target,
-				   u32 target_snapshot)
-{
-	struct bch_fs *c = trans->c;
-	struct printbuf buf = PRINTBUF;
-	int ret = 0;
-
-	if (inode_points_to_dirent(target, d))
-		return 0;
-
-	if (bch2_inode_should_have_bp(target) &&
-	    !fsck_err(c, inode_wrong_backpointer,
-		      "dirent points to inode that does not point back:\n  %s",
-		      (bch2_bkey_val_to_text(&buf, c, d.s_c),
-		       prt_printf(&buf, "\n  "),
-		       bch2_inode_unpacked_to_text(&buf, target),
-		       buf.buf)))
-		goto out_noiter;
-
-	if (!target->bi_dir &&
-	    !target->bi_dir_offset) {
-		target->bi_dir		= d.k->p.inode;
-		target->bi_dir_offset	= d.k->p.offset;
-		return __bch2_fsck_write_inode(trans, target, target_snapshot);
-	}
-
-	struct btree_iter bp_iter = { NULL };
-	struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
-			      SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
-	ret = bkey_err(bp_dirent);
-	if (ret && !bch2_err_matches(ret, ENOENT))
-		goto err;
-
-	bool backpointer_exists = !ret;
-	ret = 0;
-
-	if (fsck_err_on(!backpointer_exists,
-			c, inode_wrong_backpointer,
-			"inode %llu:%u has wrong backpointer:\n"
-			"got       %llu:%llu\n"
-			"should be %llu:%llu",
-			target->bi_inum, target_snapshot,
-			target->bi_dir,
-			target->bi_dir_offset,
-			d.k->p.inode,
-			d.k->p.offset)) {
-		target->bi_dir		= d.k->p.inode;
-		target->bi_dir_offset	= d.k->p.offset;
-		ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
-		goto out;
-	}
-
-	bch2_bkey_val_to_text(&buf, c, d.s_c);
-	prt_newline(&buf);
-	if (backpointer_exists)
-		bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
-
-	if (fsck_err_on(backpointer_exists &&
-			(S_ISDIR(target->bi_mode) ||
-			 target->bi_subvol),
-			c, inode_dir_multiple_links,
-			"%s %llu:%u with multiple links\n%s",
-			S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
-			target->bi_inum, target_snapshot, buf.buf)) {
-		ret = __remove_dirent(trans, d.k->p);
-		goto out;
-	}
-
-	/*
-	 * hardlinked file with nlink 0:
-	 * We're just adjusting nlink here so check_nlinks() will pick
-	 * it up, it ignores inodes with nlink 0
-	 */
-	if (fsck_err_on(backpointer_exists && !target->bi_nlink,
-			c, inode_multiple_links_but_nlink_0,
-			"inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
-			target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
-		target->bi_nlink++;
-		target->bi_flags &= ~BCH_INODE_unlinked;
-		ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
-		if (ret)
-			goto err;
-	}
-out:
-err:
-fsck_err:
-	bch2_trans_iter_exit(trans, &bp_iter);
-out_noiter:
-	printbuf_exit(&buf);
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-static int check_dirent_target(struct btree_trans *trans,
-			       struct btree_iter *iter,
-			       struct bkey_s_c_dirent d,
-			       struct bch_inode_unpacked *target,
-			       u32 target_snapshot)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_i_dirent *n;
-	struct printbuf buf = PRINTBUF;
-	int ret = 0;
-
-	ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot);
-	if (ret)
-		goto err;
-
-	if (fsck_err_on(d.v->d_type != inode_d_type(target),
-			c, dirent_d_type_wrong,
-			"incorrect d_type: got %s, should be %s:\n%s",
-			bch2_d_type_str(d.v->d_type),
-			bch2_d_type_str(inode_d_type(target)),
-			(printbuf_reset(&buf),
-			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
-		n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
-		ret = PTR_ERR_OR_ZERO(n);
-		if (ret)
-			goto err;
-
-		bkey_reassemble(&n->k_i, d.s_c);
-		n->v.d_type = inode_d_type(target);
-		if (n->v.d_type == DT_SUBVOL) {
-			n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
-			n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
-		} else {
-			n->v.d_inum = cpu_to_le64(target->bi_inum);
-		}
-
-		ret = bch2_trans_update(trans, iter, &n->k_i, 0);
-		if (ret)
-			goto err;
-
-		d = dirent_i_to_s_c(n);
-	}
-err:
-fsck_err:
-	printbuf_exit(&buf);
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-/* find a subvolume that's a descendent of @snapshot: */
-static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
-{
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	int ret;
-
-	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
-		if (k.k->type != KEY_TYPE_subvolume)
-			continue;
-
-		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
-		if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
-			bch2_trans_iter_exit(trans, &iter);
-			*subvolid = k.k->p.offset;
-			goto found;
-		}
-	}
-	if (!ret)
-		ret = -ENOENT;
-found:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
-				  struct bkey_s_c_dirent d)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter subvol_iter = {};
-	struct bch_inode_unpacked subvol_root;
-	u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
-	u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
-	u32 parent_snapshot;
-	u32 new_parent_subvol = 0;
-	u64 parent_inum;
-	struct printbuf buf = PRINTBUF;
-	int ret = 0;
-
-	ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
-	if (ret && !bch2_err_matches(ret, ENOENT))
-		return ret;
-
-	if (ret ||
-	    (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
-		int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
-		if (ret2 && !bch2_err_matches(ret, ENOENT))
-			return ret2;
-	}
-
-	if (ret &&
-	    !new_parent_subvol &&
-	    (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
-		/*
-		 * Couldn't find a subvol for dirent's snapshot - but we lost
-		 * subvols, so we need to reconstruct:
-		 */
-		ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
-		if (ret)
-			return ret;
-
-		parent_snapshot = d.k->p.snapshot;
-	}
-
-	if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol,
-			"dirent parent_subvol points to missing subvolume\n%s",
-			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
-	    fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
-			c, dirent_not_visible_in_parent_subvol,
-			"dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
-			parent_snapshot,
-			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
-		if (!new_parent_subvol) {
-			bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
-			return -BCH_ERR_fsck_repair_unimplemented;
-		}
-
-		struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
-		ret = PTR_ERR_OR_ZERO(new_dirent);
-		if (ret)
-			goto err;
-
-		new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
-	}
-
-	struct bkey_s_c_subvolume s =
-		bch2_bkey_get_iter_typed(trans, &subvol_iter,
-					 BTREE_ID_subvolumes, POS(0, target_subvol),
-					 0, subvolume);
-	ret = bkey_err(s.s_c);
-	if (ret && !bch2_err_matches(ret, ENOENT))
-		return ret;
-
-	if (ret) {
-		if (fsck_err(c, dirent_to_missing_subvol,
-			     "dirent points to missing subvolume\n%s",
-			     (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
-			return __remove_dirent(trans, d.k->p);
-		ret = 0;
-		goto out;
-	}
-
-	if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
-			c, subvol_fs_path_parent_wrong,
-			"subvol with wrong fs_path_parent, should be be %u\n%s",
-			parent_subvol,
-			(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
-		struct bkey_i_subvolume *n =
-			bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
-		ret = PTR_ERR_OR_ZERO(n);
-		if (ret)
-			goto err;
-
-		n->v.fs_path_parent = cpu_to_le32(parent_subvol);
-	}
-
-	u64 target_inum = le64_to_cpu(s.v->inode);
-	u32 target_snapshot = le32_to_cpu(s.v->snapshot);
-
-	ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot);
-	if (ret && !bch2_err_matches(ret, ENOENT))
-		goto err;
-
-	if (ret) {
-		bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
-		ret = -BCH_ERR_fsck_repair_unimplemented;
-		ret = 0;
-		goto err;
-	}
-
-	if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
-			c, inode_bi_parent_wrong,
-			"subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
-			target_inum,
-			subvol_root.bi_parent_subvol, parent_subvol)) {
-		subvol_root.bi_parent_subvol = parent_subvol;
-		ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
-		if (ret)
-			goto err;
-	}
-
-	ret = check_dirent_target(trans, iter, d, &subvol_root,
-				  target_snapshot);
-	if (ret)
-		goto err;
-out:
-err:
-fsck_err:
-	bch2_trans_iter_exit(trans, &subvol_iter);
-	printbuf_exit(&buf);
-	return ret;
-}
-
-static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
-			struct bkey_s_c k,
-			struct bch_hash_info *hash_info,
-			struct inode_walker *dir,
-			struct inode_walker *target,
-			struct snapshots_seen *s)
-{
-	struct bch_fs *c = trans->c;
-	struct inode_walker_entry *i;
-	struct printbuf buf = PRINTBUF;
-	int ret = 0;
-
-	ret = check_key_has_snapshot(trans, iter, k);
-	if (ret) {
-		ret = ret < 0 ? ret : 0;
-		goto out;
-	}
-
-	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
-	if (ret)
-		goto err;
-
-	if (k.k->type == KEY_TYPE_whiteout)
-		goto out;
-
-	if (dir->last_pos.inode != k.k->p.inode) {
-		ret = check_subdir_count(trans, dir);
-		if (ret)
-			goto err;
-	}
-
-	BUG_ON(!btree_iter_path(trans, iter)->should_be_locked);
-
-	i = walk_inode(trans, dir, k);
-	ret = PTR_ERR_OR_ZERO(i);
-	if (ret < 0)
-		goto err;
-
-	if (dir->first_this_inode && dir->inodes.nr)
-		*hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
-	dir->first_this_inode = false;
-
-	if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
-		ret =   reconstruct_inode(trans, k.k->p.snapshot, k.k->p.inode, 0, S_IFDIR) ?:
-			bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-		if (ret)
-			goto err;
-
-		dir->last_pos.inode--;
-		ret = -BCH_ERR_transaction_restart_nested;
-		goto err;
-	}
-
-	if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
-			"dirent in nonexisting directory:\n%s",
-			(printbuf_reset(&buf),
-			 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-		ret = bch2_btree_delete_at(trans, iter,
-				BTREE_UPDATE_internal_snapshot_node);
-		goto out;
-	}
-
-	if (!i)
-		goto out;
-
-	if (fsck_err_on(!S_ISDIR(i->inode.bi_mode),
-			c, dirent_in_non_dir_inode,
-			"dirent in non directory inode type %s:\n%s",
-			bch2_d_type_str(inode_d_type(&i->inode)),
-			(printbuf_reset(&buf),
-			 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-		ret = bch2_btree_delete_at(trans, iter, 0);
-		goto out;
-	}
-
-	ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
-	if (ret < 0)
-		goto err;
-	if (ret) {
-		/* dirent has been deleted */
-		ret = 0;
-		goto out;
-	}
-
-	if (k.k->type != KEY_TYPE_dirent)
-		goto out;
-
-	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-
-	if (d.v->d_type == DT_SUBVOL) {
-		ret = check_dirent_to_subvol(trans, iter, d);
-		if (ret)
-			goto err;
-	} else {
-		ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
-		if (ret)
-			goto err;
-
-		if (fsck_err_on(!target->inodes.nr,
-				c, dirent_to_missing_inode,
-				"dirent points to missing inode:\n%s",
-				(printbuf_reset(&buf),
-				 bch2_bkey_val_to_text(&buf, c, k),
-				 buf.buf))) {
-			ret = __remove_dirent(trans, d.k->p);
-			if (ret)
-				goto err;
-		}
-
-		darray_for_each(target->inodes, i) {
-			ret = check_dirent_target(trans, iter, d,
-						  &i->inode, i->snapshot);
-			if (ret)
-				goto err;
-		}
-
-		if (d.v->d_type == DT_DIR)
-			for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
-				i->count++;
-	}
-out:
-err:
-fsck_err:
-	printbuf_exit(&buf);
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-/*
- * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
- * validate d_type
- */
-int bch2_check_dirents(struct bch_fs *c)
-{
-	struct inode_walker dir = inode_walker_init();
-	struct inode_walker target = inode_walker_init();
-	struct snapshots_seen s;
-	struct bch_hash_info hash_info;
-
-	snapshots_seen_init(&s);
-
-	int ret = bch2_trans_run(c,
-		for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
-				POS(BCACHEFS_ROOT_INO, 0),
-				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
-				k,
-				NULL, NULL,
-				BCH_TRANS_COMMIT_no_enospc,
-			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
-		check_subdir_count_notnested(trans, &dir));
-
-	snapshots_seen_exit(&s);
-	inode_walker_exit(&dir);
-	inode_walker_exit(&target);
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
-		       struct bkey_s_c k,
-		       struct bch_hash_info *hash_info,
-		       struct inode_walker *inode)
-{
-	struct bch_fs *c = trans->c;
-	struct inode_walker_entry *i;
-	int ret;
-
-	ret = check_key_has_snapshot(trans, iter, k);
-	if (ret < 0)
-		return ret;
-	if (ret)
-		return 0;
-
-	i = walk_inode(trans, inode, k);
-	ret = PTR_ERR_OR_ZERO(i);
-	if (ret)
-		return ret;
-
-	if (inode->first_this_inode && inode->inodes.nr)
-		*hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
-	inode->first_this_inode = false;
-
-	if (fsck_err_on(!i, c, xattr_in_missing_inode,
-			"xattr for missing inode %llu",
-			k.k->p.inode))
-		return bch2_btree_delete_at(trans, iter, 0);
-
-	if (!i)
-		return 0;
-
-	ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
-fsck_err:
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-/*
- * Walk xattrs: verify that they all have a corresponding inode
- */
-int bch2_check_xattrs(struct bch_fs *c)
-{
-	struct inode_walker inode = inode_walker_init();
-	struct bch_hash_info hash_info;
-	int ret = 0;
-
-	ret = bch2_trans_run(c,
-		for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
-			POS(BCACHEFS_ROOT_INO, 0),
-			BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
-			k,
-			NULL, NULL,
-			BCH_TRANS_COMMIT_no_enospc,
-		check_xattr(trans, &iter, k, &hash_info, &inode)));
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-static int check_root_trans(struct btree_trans *trans)
-{
-	struct bch_fs *c = trans->c;
-	struct bch_inode_unpacked root_inode;
-	u32 snapshot;
-	u64 inum;
-	int ret;
-
-	ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
-	if (ret && !bch2_err_matches(ret, ENOENT))
-		return ret;
-
-	if (mustfix_fsck_err_on(ret, c, root_subvol_missing,
-				"root subvol missing")) {
-		struct bkey_i_subvolume *root_subvol =
-			bch2_trans_kmalloc(trans, sizeof(*root_subvol));
-		ret = PTR_ERR_OR_ZERO(root_subvol);
-		if (ret)
-			goto err;
-
-		snapshot	= U32_MAX;
-		inum		= BCACHEFS_ROOT_INO;
-
-		bkey_subvolume_init(&root_subvol->k_i);
-		root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
-		root_subvol->v.flags	= 0;
-		root_subvol->v.snapshot	= cpu_to_le32(snapshot);
-		root_subvol->v.inode	= cpu_to_le64(inum);
-		ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
-		bch_err_msg(c, ret, "writing root subvol");
-		if (ret)
-			goto err;
-	}
-
-	ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
-	if (ret && !bch2_err_matches(ret, ENOENT))
-		return ret;
-
-	if (mustfix_fsck_err_on(ret, c, root_dir_missing,
-				"root directory missing") ||
-	    mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
-				c, root_inode_not_dir,
-				"root inode not a directory")) {
-		bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
-				0, NULL);
-		root_inode.bi_inum = inum;
-
-		ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot);
-		bch_err_msg(c, ret, "writing root inode");
-	}
-err:
-fsck_err:
-	return ret;
-}
-
-/* Get root directory, create if it doesn't exist: */
-int bch2_check_root(struct bch_fs *c)
-{
-	int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-		check_root_trans(trans));
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-typedef DARRAY(u32) darray_u32;
-
-static bool darray_u32_has(darray_u32 *d, u32 v)
-{
-	darray_for_each(*d, i)
-		if (*i == v)
-			return true;
-	return false;
-}
-
-/*
- * We've checked that inode backpointers point to valid dirents; here, it's
- * sufficient to check that the subvolume root has a dirent:
- */
-static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s)
-{
-	struct bch_inode_unpacked inode;
-	int ret = bch2_inode_find_by_inum_trans(trans,
-				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
-				&inode);
-	if (ret)
-		return ret;
-
-	return inode.bi_dir != 0;
-}
-
-static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter parent_iter = {};
-	darray_u32 subvol_path = {};
-	struct printbuf buf = PRINTBUF;
-	int ret = 0;
-
-	if (k.k->type != KEY_TYPE_subvolume)
-		return 0;
-
-	while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
-		ret = darray_push(&subvol_path, k.k->p.offset);
-		if (ret)
-			goto err;
-
-		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
-
-		ret = subvol_has_dirent(trans, s);
-		if (ret < 0)
-			break;
-
-		if (fsck_err_on(!ret,
-				c, subvol_unreachable,
-				"unreachable subvolume %s",
-				(bch2_bkey_val_to_text(&buf, c, s.s_c),
-				 buf.buf))) {
-			ret = reattach_subvol(trans, s);
-			break;
-		}
-
-		u32 parent = le32_to_cpu(s.v->fs_path_parent);
-
-		if (darray_u32_has(&subvol_path, parent)) {
-			if (fsck_err(c, subvol_loop, "subvolume loop"))
-				ret = reattach_subvol(trans, s);
-			break;
-		}
-
-		bch2_trans_iter_exit(trans, &parent_iter);
-		bch2_trans_iter_init(trans, &parent_iter,
-				     BTREE_ID_subvolumes, POS(0, parent), 0);
-		k = bch2_btree_iter_peek_slot(&parent_iter);
-		ret = bkey_err(k);
-		if (ret)
-			goto err;
-
-		if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
-				c, subvol_unreachable,
-				"unreachable subvolume %s",
-				(bch2_bkey_val_to_text(&buf, c, s.s_c),
-				 buf.buf))) {
-			ret = reattach_subvol(trans, s);
-			break;
-		}
-	}
-fsck_err:
-err:
-	printbuf_exit(&buf);
-	darray_exit(&subvol_path);
-	bch2_trans_iter_exit(trans, &parent_iter);
-	return ret;
-}
-
-int bch2_check_subvolume_structure(struct bch_fs *c)
-{
-	int ret = bch2_trans_run(c,
-		for_each_btree_key_commit(trans, iter,
-				BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
-				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-			check_subvol_path(trans, &iter, k)));
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-struct pathbuf_entry {
-	u64	inum;
-	u32	snapshot;
-};
-
-typedef DARRAY(struct pathbuf_entry) pathbuf;
-
-static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
-{
-	darray_for_each(*p, i)
-		if (i->inum	== inum &&
-		    i->snapshot	== snapshot)
-			return true;
-	return false;
-}
-
-/*
- * Check that a given inode is reachable from its subvolume root - we already
- * verified subvolume connectivity:
- *
- * XXX: we should also be verifying that inodes are in the right subvolumes
- */
-static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter inode_iter = {};
-	struct bch_inode_unpacked inode;
-	struct printbuf buf = PRINTBUF;
-	u32 snapshot = inode_k.k->p.snapshot;
-	int ret = 0;
-
-	p->nr = 0;
-
-	BUG_ON(bch2_inode_unpack(inode_k, &inode));
-
-	while (!inode.bi_subvol) {
-		struct btree_iter dirent_iter;
-		struct bkey_s_c_dirent d;
-		u32 parent_snapshot = snapshot;
-
-		d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
-		ret = bkey_err(d.s_c);
-		if (ret && !bch2_err_matches(ret, ENOENT))
-			break;
-
-		if (!ret && !dirent_points_to_inode(d, &inode)) {
-			bch2_trans_iter_exit(trans, &dirent_iter);
-			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
-		}
-
-		if (bch2_err_matches(ret, ENOENT)) {
-			ret = 0;
-			if (fsck_err(c, inode_unreachable,
-				     "unreachable inode\n%s",
-				     (printbuf_reset(&buf),
-				      bch2_bkey_val_to_text(&buf, c, inode_k),
-				      buf.buf)))
-				ret = reattach_inode(trans, &inode, snapshot);
-			goto out;
-		}
-
-		bch2_trans_iter_exit(trans, &dirent_iter);
-
-		if (!S_ISDIR(inode.bi_mode))
-			break;
-
-		ret = darray_push(p, ((struct pathbuf_entry) {
-			.inum		= inode.bi_inum,
-			.snapshot	= snapshot,
-		}));
-		if (ret)
-			return ret;
-
-		snapshot = parent_snapshot;
-
-		bch2_trans_iter_exit(trans, &inode_iter);
-		inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
-					     SPOS(0, inode.bi_dir, snapshot), 0);
-		ret = bkey_err(inode_k) ?:
-			!bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
-			: bch2_inode_unpack(inode_k, &inode);
-		if (ret) {
-			/* Should have been caught in dirents pass */
-			if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
-				bch_err(c, "error looking up parent directory: %i", ret);
-			break;
-		}
-
-		snapshot = inode_k.k->p.snapshot;
-
-		if (path_is_dup(p, inode.bi_inum, snapshot)) {
-			/* XXX print path */
-			bch_err(c, "directory structure loop");
-
-			darray_for_each(*p, i)
-				pr_err("%llu:%u", i->inum, i->snapshot);
-			pr_err("%llu:%u", inode.bi_inum, snapshot);
-
-			if (fsck_err(c, dir_loop, "directory structure loop")) {
-				ret = remove_backpointer(trans, &inode);
-				bch_err_msg(c, ret, "removing dirent");
-				if (ret)
-					break;
-
-				ret = reattach_inode(trans, &inode, snapshot);
-				bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
-			}
-			break;
-		}
-	}
-out:
-fsck_err:
-	bch2_trans_iter_exit(trans, &inode_iter);
-	printbuf_exit(&buf);
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-/*
- * Check for unreachable inodes, as well as loops in the directory structure:
- * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's
- * unreachable:
- */
-int bch2_check_directory_structure(struct bch_fs *c)
-{
-	pathbuf path = { 0, };
-	int ret;
-
-	ret = bch2_trans_run(c,
-		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
-					  BTREE_ITER_intent|
-					  BTREE_ITER_prefetch|
-					  BTREE_ITER_all_snapshots, k,
-					  NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
-			if (!bkey_is_inode(k.k))
-				continue;
-
-			if (bch2_inode_flags(k) & BCH_INODE_unlinked)
-				continue;
-
-			check_path(trans, &path, k);
-		})));
-	darray_exit(&path);
-
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-struct nlink_table {
-	size_t		nr;
-	size_t		size;
-
-	struct nlink {
-		u64	inum;
-		u32	snapshot;
-		u32	count;
-	}		*d;
-};
-
-static int add_nlink(struct bch_fs *c, struct nlink_table *t,
-		     u64 inum, u32 snapshot)
-{
-	if (t->nr == t->size) {
-		size_t new_size = max_t(size_t, 128UL, t->size * 2);
-		void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
-
-		if (!d) {
-			bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
-				new_size);
-			return -BCH_ERR_ENOMEM_fsck_add_nlink;
-		}
-
-		if (t->d)
-			memcpy(d, t->d, t->size * sizeof(t->d[0]));
-		kvfree(t->d);
-
-		t->d = d;
-		t->size = new_size;
-	}
-
-
-	t->d[t->nr++] = (struct nlink) {
-		.inum		= inum,
-		.snapshot	= snapshot,
-	};
-
-	return 0;
-}
-
-static int nlink_cmp(const void *_l, const void *_r)
-{
-	const struct nlink *l = _l;
-	const struct nlink *r = _r;
-
-	return cmp_int(l->inum, r->inum);
-}
-
-static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
-		     struct nlink_table *links,
-		     u64 range_start, u64 range_end, u64 inum, u32 snapshot)
-{
-	struct nlink *link, key = {
-		.inum = inum, .snapshot = U32_MAX,
-	};
-
-	if (inum < range_start || inum >= range_end)
-		return;
-
-	link = __inline_bsearch(&key, links->d, links->nr,
-				sizeof(links->d[0]), nlink_cmp);
-	if (!link)
-		return;
-
-	while (link > links->d && link[0].inum == link[-1].inum)
-		--link;
-
-	for (; link < links->d + links->nr && link->inum == inum; link++)
-		if (ref_visible(c, s, snapshot, link->snapshot)) {
-			link->count++;
-			if (link->snapshot >= snapshot)
-				break;
-		}
-}
-
-noinline_for_stack
-static int check_nlinks_find_hardlinks(struct bch_fs *c,
-				       struct nlink_table *t,
-				       u64 start, u64 *end)
-{
-	int ret = bch2_trans_run(c,
-		for_each_btree_key(trans, iter, BTREE_ID_inodes,
-				   POS(0, start),
-				   BTREE_ITER_intent|
-				   BTREE_ITER_prefetch|
-				   BTREE_ITER_all_snapshots, k, ({
-			if (!bkey_is_inode(k.k))
-				continue;
-
-			/* Should never fail, checked by bch2_inode_invalid: */
-			struct bch_inode_unpacked u;
-			BUG_ON(bch2_inode_unpack(k, &u));
-
-			/*
-			 * Backpointer and directory structure checks are sufficient for
-			 * directories, since they can't have hardlinks:
-			 */
-			if (S_ISDIR(u.bi_mode))
-				continue;
-
-			if (!u.bi_nlink)
-				continue;
-
-			ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
-			if (ret) {
-				*end = k.k->p.offset;
-				ret = 0;
-				break;
-			}
-			0;
-		})));
-
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-noinline_for_stack
-static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
-				     u64 range_start, u64 range_end)
-{
-	struct snapshots_seen s;
-
-	snapshots_seen_init(&s);
-
-	int ret = bch2_trans_run(c,
-		for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
-				   BTREE_ITER_intent|
-				   BTREE_ITER_prefetch|
-				   BTREE_ITER_all_snapshots, k, ({
-			ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
-			if (ret)
-				break;
-
-			if (k.k->type == KEY_TYPE_dirent) {
-				struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-
-				if (d.v->d_type != DT_DIR &&
-				    d.v->d_type != DT_SUBVOL)
-					inc_link(c, &s, links, range_start, range_end,
-						 le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
-			}
-			0;
-		})));
-
-	snapshots_seen_exit(&s);
-
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
-				     struct bkey_s_c k,
-				     struct nlink_table *links,
-				     size_t *idx, u64 range_end)
-{
-	struct bch_fs *c = trans->c;
-	struct bch_inode_unpacked u;
-	struct nlink *link = &links->d[*idx];
-	int ret = 0;
-
-	if (k.k->p.offset >= range_end)
-		return 1;
-
-	if (!bkey_is_inode(k.k))
-		return 0;
-
-	BUG_ON(bch2_inode_unpack(k, &u));
-
-	if (S_ISDIR(u.bi_mode))
-		return 0;
-
-	if (!u.bi_nlink)
-		return 0;
-
-	while ((cmp_int(link->inum, k.k->p.offset) ?:
-		cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
-		BUG_ON(*idx == links->nr);
-		link = &links->d[++*idx];
-	}
-
-	if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
-			c, inode_wrong_nlink,
-			"inode %llu type %s has wrong i_nlink (%u, should be %u)",
-			u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
-			bch2_inode_nlink_get(&u), link->count)) {
-		bch2_inode_nlink_set(&u, link->count);
-		ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot);
-	}
-fsck_err:
-	return ret;
-}
-
-noinline_for_stack
-static int check_nlinks_update_hardlinks(struct bch_fs *c,
-			       struct nlink_table *links,
-			       u64 range_start, u64 range_end)
-{
-	size_t idx = 0;
-
-	int ret = bch2_trans_run(c,
-		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
-				POS(0, range_start),
-				BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
-				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-			check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
-	if (ret < 0) {
-		bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
-		return ret;
-	}
-
-	return 0;
-}
-
-int bch2_check_nlinks(struct bch_fs *c)
-{
-	struct nlink_table links = { 0 };
-	u64 this_iter_range_start, next_iter_range_start = 0;
-	int ret = 0;
-
-	do {
-		this_iter_range_start = next_iter_range_start;
-		next_iter_range_start = U64_MAX;
-
-		ret = check_nlinks_find_hardlinks(c, &links,
-						  this_iter_range_start,
-						  &next_iter_range_start);
-
-		ret = check_nlinks_walk_dirents(c, &links,
-					  this_iter_range_start,
-					  next_iter_range_start);
-		if (ret)
-			break;
-
-		ret = check_nlinks_update_hardlinks(c, &links,
-					 this_iter_range_start,
-					 next_iter_range_start);
-		if (ret)
-			break;
-
-		links.nr = 0;
-	} while (next_iter_range_start != U64_MAX);
-
-	kvfree(links.d);
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
-			     struct bkey_s_c k)
-{
-	struct bkey_s_c_reflink_p p;
-	struct bkey_i_reflink_p *u;
-
-	if (k.k->type != KEY_TYPE_reflink_p)
-		return 0;
-
-	p = bkey_s_c_to_reflink_p(k);
-
-	if (!p.v->front_pad && !p.v->back_pad)
-		return 0;
-
-	u = bch2_trans_kmalloc(trans, sizeof(*u));
-	int ret = PTR_ERR_OR_ZERO(u);
-	if (ret)
-		return ret;
-
-	bkey_reassemble(&u->k_i, k);
-	u->v.front_pad	= 0;
-	u->v.back_pad	= 0;
-
-	return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
-}
-
-int bch2_fix_reflink_p(struct bch_fs *c)
-{
-	if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
-		return 0;
-
-	int ret = bch2_trans_run(c,
-		for_each_btree_key_commit(trans, iter,
-				BTREE_ID_extents, POS_MIN,
-				BTREE_ITER_intent|BTREE_ITER_prefetch|
-				BTREE_ITER_all_snapshots, k,
-				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-			fix_reflink_p_key(trans, &iter, k)));
-	bch_err_fn(c, ret);
-	return ret;
-}
diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
deleted file mode 100644
index a4ef94271784..000000000000
--- a/fs/bcachefs/fsck.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FSCK_H
-#define _BCACHEFS_FSCK_H
-
-int bch2_check_inodes(struct bch_fs *);
-int bch2_check_extents(struct bch_fs *);
-int bch2_check_indirect_extents(struct bch_fs *);
-int bch2_check_dirents(struct bch_fs *);
-int bch2_check_xattrs(struct bch_fs *);
-int bch2_check_root(struct bch_fs *);
-int bch2_check_subvolume_structure(struct bch_fs *);
-int bch2_check_directory_structure(struct bch_fs *);
-int bch2_check_nlinks(struct bch_fs *);
-int bch2_fix_reflink_p(struct bch_fs *);
-
-#endif /* _BCACHEFS_FSCK_H */
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
deleted file mode 100644
index aafa79fa6351..000000000000
--- a/fs/bcachefs/inode.c
+++ /dev/null
@@ -1,1220 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_key_cache.h"
-#include "btree_write_buffer.h"
-#include "bkey_methods.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "compress.h"
-#include "dirent.h"
-#include "error.h"
-#include "extents.h"
-#include "extent_update.h"
-#include "inode.h"
-#include "str_hash.h"
-#include "snapshot.h"
-#include "subvolume.h"
-#include "varint.h"
-
-#include <linux/random.h>
-
-#include <asm/unaligned.h>
-
-#define x(name, ...)	#name,
-const char * const bch2_inode_opts[] = {
-	BCH_INODE_OPTS()
-	NULL,
-};
-
-static const char * const bch2_inode_flag_strs[] = {
-	BCH_INODE_FLAGS()
-	NULL
-};
-#undef  x
-
-static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
-
-static int inode_decode_field(const u8 *in, const u8 *end,
-			      u64 out[2], unsigned *out_bits)
-{
-	__be64 be[2] = { 0, 0 };
-	unsigned bytes, shift;
-	u8 *p;
-
-	if (in >= end)
-		return -1;
-
-	if (!*in)
-		return -1;
-
-	/*
-	 * position of highest set bit indicates number of bytes:
-	 * shift = number of bits to remove in high byte:
-	 */
-	shift	= 8 - __fls(*in); /* 1 <= shift <= 8 */
-	bytes	= byte_table[shift - 1];
-
-	if (in + bytes > end)
-		return -1;
-
-	p = (u8 *) be + 16 - bytes;
-	memcpy(p, in, bytes);
-	*p ^= (1 << 8) >> shift;
-
-	out[0] = be64_to_cpu(be[0]);
-	out[1] = be64_to_cpu(be[1]);
-	*out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]);
-
-	return bytes;
-}
-
-static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed,
-					   const struct bch_inode_unpacked *inode)
-{
-	struct bkey_i_inode_v3 *k = &packed->inode;
-	u8 *out = k->v.fields;
-	u8 *end = (void *) &packed[1];
-	u8 *last_nonzero_field = out;
-	unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
-	unsigned bytes;
-	int ret;
-
-	bkey_inode_v3_init(&packed->inode.k_i);
-	packed->inode.k.p.offset	= inode->bi_inum;
-	packed->inode.v.bi_journal_seq	= cpu_to_le64(inode->bi_journal_seq);
-	packed->inode.v.bi_hash_seed	= inode->bi_hash_seed;
-	packed->inode.v.bi_flags	= cpu_to_le64(inode->bi_flags);
-	packed->inode.v.bi_sectors	= cpu_to_le64(inode->bi_sectors);
-	packed->inode.v.bi_size		= cpu_to_le64(inode->bi_size);
-	packed->inode.v.bi_version	= cpu_to_le64(inode->bi_version);
-	SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode);
-	SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR);
-
-
-#define x(_name, _bits)							\
-	nr_fields++;							\
-									\
-	if (inode->_name) {						\
-		ret = bch2_varint_encode_fast(out, inode->_name);	\
-		out += ret;						\
-									\
-		if (_bits > 64)						\
-			*out++ = 0;					\
-									\
-		last_nonzero_field = out;				\
-		last_nonzero_fieldnr = nr_fields;			\
-	} else {							\
-		*out++ = 0;						\
-									\
-		if (_bits > 64)						\
-			*out++ = 0;					\
-	}
-
-	BCH_INODE_FIELDS_v3()
-#undef  x
-	BUG_ON(out > end);
-
-	out = last_nonzero_field;
-	nr_fields = last_nonzero_fieldnr;
-
-	bytes = out - (u8 *) &packed->inode.v;
-	set_bkey_val_bytes(&packed->inode.k, bytes);
-	memset_u64s_tail(&packed->inode.v, 0, bytes);
-
-	SET_INODEv3_NR_FIELDS(&k->v, nr_fields);
-
-	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
-		struct bch_inode_unpacked unpacked;
-
-		ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked);
-		BUG_ON(ret);
-		BUG_ON(unpacked.bi_inum		!= inode->bi_inum);
-		BUG_ON(unpacked.bi_hash_seed	!= inode->bi_hash_seed);
-		BUG_ON(unpacked.bi_sectors	!= inode->bi_sectors);
-		BUG_ON(unpacked.bi_size		!= inode->bi_size);
-		BUG_ON(unpacked.bi_version	!= inode->bi_version);
-		BUG_ON(unpacked.bi_mode		!= inode->bi_mode);
-
-#define x(_name, _bits)	if (unpacked._name != inode->_name)		\
-			panic("unpacked %llu should be %llu",		\
-			      (u64) unpacked._name, (u64) inode->_name);
-		BCH_INODE_FIELDS_v3()
-#undef  x
-	}
-}
-
-void bch2_inode_pack(struct bkey_inode_buf *packed,
-		     const struct bch_inode_unpacked *inode)
-{
-	bch2_inode_pack_inlined(packed, inode);
-}
-
-static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
-				struct bch_inode_unpacked *unpacked)
-{
-	const u8 *in = inode.v->fields;
-	const u8 *end = bkey_val_end(inode);
-	u64 field[2];
-	unsigned fieldnr = 0, field_bits;
-	int ret;
-
-#define x(_name, _bits)					\
-	if (fieldnr++ == INODE_NR_FIELDS(inode.v)) {			\
-		unsigned offset = offsetof(struct bch_inode_unpacked, _name);\
-		memset((void *) unpacked + offset, 0,			\
-		       sizeof(*unpacked) - offset);			\
-		return 0;						\
-	}								\
-									\
-	ret = inode_decode_field(in, end, field, &field_bits);		\
-	if (ret < 0)							\
-		return ret;						\
-									\
-	if (field_bits > sizeof(unpacked->_name) * 8)			\
-		return -1;						\
-									\
-	unpacked->_name = field[1];					\
-	in += ret;
-
-	BCH_INODE_FIELDS_v2()
-#undef  x
-
-	/* XXX: signal if there were more fields than expected? */
-	return 0;
-}
-
-static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked,
-				const u8 *in, const u8 *end,
-				unsigned nr_fields)
-{
-	unsigned fieldnr = 0;
-	int ret;
-	u64 v[2];
-
-#define x(_name, _bits)							\
-	if (fieldnr < nr_fields) {					\
-		ret = bch2_varint_decode_fast(in, end, &v[0]);		\
-		if (ret < 0)						\
-			return ret;					\
-		in += ret;						\
-									\
-		if (_bits > 64) {					\
-			ret = bch2_varint_decode_fast(in, end, &v[1]);	\
-			if (ret < 0)					\
-				return ret;				\
-			in += ret;					\
-		} else {						\
-			v[1] = 0;					\
-		}							\
-	} else {							\
-		v[0] = v[1] = 0;					\
-	}								\
-									\
-	unpacked->_name = v[0];						\
-	if (v[1] || v[0] != unpacked->_name)				\
-		return -1;						\
-	fieldnr++;
-
-	BCH_INODE_FIELDS_v2()
-#undef  x
-
-	/* XXX: signal if there were more fields than expected? */
-	return 0;
-}
-
-static int bch2_inode_unpack_v3(struct bkey_s_c k,
-				struct bch_inode_unpacked *unpacked)
-{
-	struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
-	const u8 *in = inode.v->fields;
-	const u8 *end = bkey_val_end(inode);
-	unsigned nr_fields = INODEv3_NR_FIELDS(inode.v);
-	unsigned fieldnr = 0;
-	int ret;
-	u64 v[2];
-
-	unpacked->bi_inum	= inode.k->p.offset;
-	unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
-	unpacked->bi_hash_seed	= inode.v->bi_hash_seed;
-	unpacked->bi_flags	= le64_to_cpu(inode.v->bi_flags);
-	unpacked->bi_sectors	= le64_to_cpu(inode.v->bi_sectors);
-	unpacked->bi_size	= le64_to_cpu(inode.v->bi_size);
-	unpacked->bi_version	= le64_to_cpu(inode.v->bi_version);
-	unpacked->bi_mode	= INODEv3_MODE(inode.v);
-
-#define x(_name, _bits)							\
-	if (fieldnr < nr_fields) {					\
-		ret = bch2_varint_decode_fast(in, end, &v[0]);		\
-		if (ret < 0)						\
-			return ret;					\
-		in += ret;						\
-									\
-		if (_bits > 64) {					\
-			ret = bch2_varint_decode_fast(in, end, &v[1]);	\
-			if (ret < 0)					\
-				return ret;				\
-			in += ret;					\
-		} else {						\
-			v[1] = 0;					\
-		}							\
-	} else {							\
-		v[0] = v[1] = 0;					\
-	}								\
-									\
-	unpacked->_name = v[0];						\
-	if (v[1] || v[0] != unpacked->_name)				\
-		return -1;						\
-	fieldnr++;
-
-	BCH_INODE_FIELDS_v3()
-#undef  x
-
-	/* XXX: signal if there were more fields than expected? */
-	return 0;
-}
-
-static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
-					       struct bch_inode_unpacked *unpacked)
-{
-	memset(unpacked, 0, sizeof(*unpacked));
-
-	switch (k.k->type) {
-	case KEY_TYPE_inode: {
-		struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
-
-		unpacked->bi_inum	= inode.k->p.offset;
-		unpacked->bi_journal_seq= 0;
-		unpacked->bi_hash_seed	= inode.v->bi_hash_seed;
-		unpacked->bi_flags	= le32_to_cpu(inode.v->bi_flags);
-		unpacked->bi_mode	= le16_to_cpu(inode.v->bi_mode);
-
-		if (INODE_NEW_VARINT(inode.v)) {
-			return bch2_inode_unpack_v2(unpacked, inode.v->fields,
-						    bkey_val_end(inode),
-						    INODE_NR_FIELDS(inode.v));
-		} else {
-			return bch2_inode_unpack_v1(inode, unpacked);
-		}
-		break;
-	}
-	case KEY_TYPE_inode_v2: {
-		struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
-
-		unpacked->bi_inum	= inode.k->p.offset;
-		unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
-		unpacked->bi_hash_seed	= inode.v->bi_hash_seed;
-		unpacked->bi_flags	= le64_to_cpu(inode.v->bi_flags);
-		unpacked->bi_mode	= le16_to_cpu(inode.v->bi_mode);
-
-		return bch2_inode_unpack_v2(unpacked, inode.v->fields,
-					    bkey_val_end(inode),
-					    INODEv2_NR_FIELDS(inode.v));
-	}
-	default:
-		BUG();
-	}
-}
-
-int bch2_inode_unpack(struct bkey_s_c k,
-		      struct bch_inode_unpacked *unpacked)
-{
-	if (likely(k.k->type == KEY_TYPE_inode_v3))
-		return bch2_inode_unpack_v3(k, unpacked);
-	return bch2_inode_unpack_slowpath(k, unpacked);
-}
-
-int bch2_inode_peek_nowarn(struct btree_trans *trans,
-		    struct btree_iter *iter,
-		    struct bch_inode_unpacked *inode,
-		    subvol_inum inum, unsigned flags)
-{
-	struct bkey_s_c k;
-	u32 snapshot;
-	int ret;
-
-	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-	if (ret)
-		return ret;
-
-	k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes,
-			       SPOS(0, inum.inum, snapshot),
-			       flags|BTREE_ITER_cached);
-	ret = bkey_err(k);
-	if (ret)
-		return ret;
-
-	ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
-	if (ret)
-		goto err;
-
-	ret = bch2_inode_unpack(k, inode);
-	if (ret)
-		goto err;
-
-	return 0;
-err:
-	bch2_trans_iter_exit(trans, iter);
-	return ret;
-}
-
-int bch2_inode_peek(struct btree_trans *trans,
-		    struct btree_iter *iter,
-		    struct bch_inode_unpacked *inode,
-		    subvol_inum inum, unsigned flags)
-{
-	int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags);
-	bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum);
-	return ret;
-}
-
-int bch2_inode_write_flags(struct btree_trans *trans,
-		     struct btree_iter *iter,
-		     struct bch_inode_unpacked *inode,
-		     enum btree_iter_update_trigger_flags flags)
-{
-	struct bkey_inode_buf *inode_p;
-
-	inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
-	if (IS_ERR(inode_p))
-		return PTR_ERR(inode_p);
-
-	bch2_inode_pack_inlined(inode_p, inode);
-	inode_p->inode.k.p.snapshot = iter->snapshot;
-	return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
-}
-
-int __bch2_fsck_write_inode(struct btree_trans *trans,
-			 struct bch_inode_unpacked *inode,
-			 u32 snapshot)
-{
-	struct bkey_inode_buf *inode_p =
-		bch2_trans_kmalloc(trans, sizeof(*inode_p));
-
-	if (IS_ERR(inode_p))
-		return PTR_ERR(inode_p);
-
-	bch2_inode_pack(inode_p, inode);
-	inode_p->inode.k.p.snapshot = snapshot;
-
-	return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
-				&inode_p->inode.k_i,
-				BTREE_UPDATE_internal_snapshot_node);
-}
-
-int bch2_fsck_write_inode(struct btree_trans *trans,
-			    struct bch_inode_unpacked *inode,
-			    u32 snapshot)
-{
-	int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-			    __bch2_fsck_write_inode(trans, inode, snapshot));
-	bch_err_fn(trans->c, ret);
-	return ret;
-}
-
-struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
-{
-	struct bch_inode_unpacked u;
-	struct bkey_inode_buf *inode_p;
-	int ret;
-
-	if (!bkey_is_inode(&k->k))
-		return ERR_PTR(-ENOENT);
-
-	inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
-	if (IS_ERR(inode_p))
-		return ERR_CAST(inode_p);
-
-	ret = bch2_inode_unpack(bkey_i_to_s_c(k), &u);
-	if (ret)
-		return ERR_PTR(ret);
-
-	bch2_inode_pack(inode_p, &u);
-	return &inode_p->inode.k_i;
-}
-
-static int __bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err)
-{
-	struct bch_inode_unpacked unpacked;
-	int ret = 0;
-
-	bkey_fsck_err_on(k.k->p.inode, c, err,
-			 inode_pos_inode_nonzero,
-			 "nonzero k.p.inode");
-
-	bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, c, err,
-			 inode_pos_blockdev_range,
-			 "fs inode in blockdev range");
-
-	bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), c, err,
-			 inode_unpack_error,
-			 "invalid variable length fields");
-
-	bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, c, err,
-			 inode_checksum_type_invalid,
-			 "invalid data checksum type (%u >= %u",
-			 unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);
-
-	bkey_fsck_err_on(unpacked.bi_compression &&
-			 !bch2_compression_opt_valid(unpacked.bi_compression - 1), c, err,
-			 inode_compression_type_invalid,
-			 "invalid compression opt %u", unpacked.bi_compression - 1);
-
-	bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) &&
-			 unpacked.bi_nlink != 0, c, err,
-			 inode_unlinked_but_nlink_nonzero,
-			 "flagged as unlinked but bi_nlink != 0");
-
-	bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), c, err,
-			 inode_subvol_root_but_not_dir,
-			 "subvolume root but not a directory");
-fsck_err:
-	return ret;
-}
-
-int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k,
-		       enum bch_validate_flags flags,
-		       struct printbuf *err)
-{
-	struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
-	int ret = 0;
-
-	bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
-			 inode_str_hash_invalid,
-			 "invalid str hash type (%llu >= %u)",
-			 INODE_STR_HASH(inode.v), BCH_STR_HASH_NR);
-
-	ret = __bch2_inode_invalid(c, k, err);
-fsck_err:
-	return ret;
-}
-
-int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
-			  enum bch_validate_flags flags,
-			  struct printbuf *err)
-{
-	struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
-	int ret = 0;
-
-	bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
-			 inode_str_hash_invalid,
-			 "invalid str hash type (%llu >= %u)",
-			 INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
-
-	ret = __bch2_inode_invalid(c, k, err);
-fsck_err:
-	return ret;
-}
-
-int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
-			  enum bch_validate_flags flags,
-			  struct printbuf *err)
-{
-	struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
-	int ret = 0;
-
-	bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
-			 INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), c, err,
-			 inode_v3_fields_start_bad,
-			 "invalid fields_start (got %llu, min %u max %zu)",
-			 INODEv3_FIELDS_START(inode.v),
-			 INODEv3_FIELDS_START_INITIAL,
-			 bkey_val_u64s(inode.k));
-
-	bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
-			 inode_str_hash_invalid,
-			 "invalid str hash type (%llu >= %u)",
-			 INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
-
-	ret = __bch2_inode_invalid(c, k, err);
-fsck_err:
-	return ret;
-}
-
-static void __bch2_inode_unpacked_to_text(struct printbuf *out,
-					  struct bch_inode_unpacked *inode)
-{
-	printbuf_indent_add(out, 2);
-	prt_printf(out, "mode=%o\n", inode->bi_mode);
-
-	prt_str(out, "flags=");
-	prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
-	prt_printf(out, " (%x)\n", inode->bi_flags);
-
-	prt_printf(out, "journal_seq=%llu\n",	inode->bi_journal_seq);
-	prt_printf(out, "bi_size=%llu\n",	inode->bi_size);
-	prt_printf(out, "bi_sectors=%llu\n",	inode->bi_sectors);
-	prt_printf(out, "bi_version=%llu\n",	inode->bi_version);
-
-#define x(_name, _bits)						\
-	prt_printf(out, #_name "=%llu\n", (u64) inode->_name);
-	BCH_INODE_FIELDS_v3()
-#undef  x
-	printbuf_indent_sub(out, 2);
-}
-
-void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
-{
-	prt_printf(out, "inum: %llu ", inode->bi_inum);
-	__bch2_inode_unpacked_to_text(out, inode);
-}
-
-void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
-	struct bch_inode_unpacked inode;
-
-	if (bch2_inode_unpack(k, &inode)) {
-		prt_printf(out, "(unpack error)");
-		return;
-	}
-
-	__bch2_inode_unpacked_to_text(out, &inode);
-}
-
-static inline u64 bkey_inode_flags(struct bkey_s_c k)
-{
-	switch (k.k->type) {
-	case KEY_TYPE_inode:
-		return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
-	case KEY_TYPE_inode_v2:
-		return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
-	case KEY_TYPE_inode_v3:
-		return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
-	default:
-		return 0;
-	}
-}
-
-static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
-{
-	return bkey_inode_flags(k) & BCH_INODE_unlinked;
-}
-
-int bch2_trigger_inode(struct btree_trans *trans,
-		       enum btree_id btree_id, unsigned level,
-		       struct bkey_s_c old,
-		       struct bkey_s new,
-		       enum btree_iter_update_trigger_flags flags)
-{
-	s64 nr = (s64) bkey_is_inode(new.k) - (s64) bkey_is_inode(old.k);
-
-	if (flags & BTREE_TRIGGER_transactional) {
-		if (nr) {
-			int ret = bch2_replicas_deltas_realloc(trans, 0);
-			if (ret)
-				return ret;
-
-			trans->fs_usage_deltas->nr_inodes += nr;
-		}
-
-		bool old_deleted = bkey_is_deleted_inode(old);
-		bool new_deleted = bkey_is_deleted_inode(new.s_c);
-		if (old_deleted != new_deleted) {
-			int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
-							      new.k->p, new_deleted);
-			if (ret)
-				return ret;
-		}
-	}
-
-	if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
-		BUG_ON(!trans->journal_res.seq);
-
-		bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
-	}
-
-	if (flags & BTREE_TRIGGER_gc) {
-		struct bch_fs *c = trans->c;
-
-		percpu_down_read(&c->mark_lock);
-		this_cpu_add(c->usage_gc->b.nr_inodes, nr);
-		percpu_up_read(&c->mark_lock);
-	}
-
-	return 0;
-}
-
-int bch2_inode_generation_invalid(struct bch_fs *c, struct bkey_s_c k,
-				  enum bch_validate_flags flags,
-				  struct printbuf *err)
-{
-	int ret = 0;
-
-	bkey_fsck_err_on(k.k->p.inode, c, err,
-			 inode_pos_inode_nonzero,
-			 "nonzero k.p.inode");
-fsck_err:
-	return ret;
-}
-
-void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
-				   struct bkey_s_c k)
-{
-	struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k);
-
-	prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
-}
-
-void bch2_inode_init_early(struct bch_fs *c,
-			   struct bch_inode_unpacked *inode_u)
-{
-	enum bch_str_hash_type str_hash =
-		bch2_str_hash_opt_to_type(c, c->opts.str_hash);
-
-	memset(inode_u, 0, sizeof(*inode_u));
-
-	/* ick */
-	inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET;
-	get_random_bytes(&inode_u->bi_hash_seed,
-			 sizeof(inode_u->bi_hash_seed));
-}
-
-void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now,
-			  uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
-			  struct bch_inode_unpacked *parent)
-{
-	inode_u->bi_mode	= mode;
-	inode_u->bi_uid		= uid;
-	inode_u->bi_gid		= gid;
-	inode_u->bi_dev		= rdev;
-	inode_u->bi_atime	= now;
-	inode_u->bi_mtime	= now;
-	inode_u->bi_ctime	= now;
-	inode_u->bi_otime	= now;
-
-	if (parent && parent->bi_mode & S_ISGID) {
-		inode_u->bi_gid = parent->bi_gid;
-		if (S_ISDIR(mode))
-			inode_u->bi_mode |= S_ISGID;
-	}
-
-	if (parent) {
-#define x(_name, ...)	inode_u->bi_##_name = parent->bi_##_name;
-		BCH_INODE_OPTS()
-#undef x
-	}
-}
-
-void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
-		     uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
-		     struct bch_inode_unpacked *parent)
-{
-	bch2_inode_init_early(c, inode_u);
-	bch2_inode_init_late(inode_u, bch2_current_time(c),
-			     uid, gid, mode, rdev, parent);
-}
-
-static inline u32 bkey_generation(struct bkey_s_c k)
-{
-	switch (k.k->type) {
-	case KEY_TYPE_inode:
-	case KEY_TYPE_inode_v2:
-		BUG();
-	case KEY_TYPE_inode_generation:
-		return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
-	default:
-		return 0;
-	}
-}
-
-/*
- * This just finds an empty slot:
- */
-int bch2_inode_create(struct btree_trans *trans,
-		      struct btree_iter *iter,
-		      struct bch_inode_unpacked *inode_u,
-		      u32 snapshot, u64 cpu)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_s_c k;
-	u64 min, max, start, pos, *hint;
-	int ret = 0;
-	unsigned bits = (c->opts.inodes_32bit ? 31 : 63);
-
-	if (c->opts.shard_inode_numbers) {
-		bits -= c->inode_shard_bits;
-
-		min = (cpu << bits);
-		max = (cpu << bits) | ~(ULLONG_MAX << bits);
-
-		min = max_t(u64, min, BLOCKDEV_INODE_MAX);
-		hint = c->unused_inode_hints + cpu;
-	} else {
-		min = BLOCKDEV_INODE_MAX;
-		max = ~(ULLONG_MAX << bits);
-		hint = c->unused_inode_hints;
-	}
-
-	start = READ_ONCE(*hint);
-
-	if (start >= max || start < min)
-		start = min;
-
-	pos = start;
-	bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
-			     BTREE_ITER_all_snapshots|
-			     BTREE_ITER_intent);
-again:
-	while ((k = bch2_btree_iter_peek(iter)).k &&
-	       !(ret = bkey_err(k)) &&
-	       bkey_lt(k.k->p, POS(0, max))) {
-		if (pos < iter->pos.offset)
-			goto found_slot;
-
-		/*
-		 * We don't need to iterate over keys in every snapshot once
-		 * we've found just one:
-		 */
-		pos = iter->pos.offset + 1;
-		bch2_btree_iter_set_pos(iter, POS(0, pos));
-	}
-
-	if (!ret && pos < max)
-		goto found_slot;
-
-	if (!ret && start == min)
-		ret = -BCH_ERR_ENOSPC_inode_create;
-
-	if (ret) {
-		bch2_trans_iter_exit(trans, iter);
-		return ret;
-	}
-
-	/* Retry from start */
-	pos = start = min;
-	bch2_btree_iter_set_pos(iter, POS(0, pos));
-	goto again;
-found_slot:
-	bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot));
-	k = bch2_btree_iter_peek_slot(iter);
-	ret = bkey_err(k);
-	if (ret) {
-		bch2_trans_iter_exit(trans, iter);
-		return ret;
-	}
-
-	*hint			= k.k->p.offset;
-	inode_u->bi_inum	= k.k->p.offset;
-	inode_u->bi_generation	= bkey_generation(k);
-	return 0;
-}
-
-static int bch2_inode_delete_keys(struct btree_trans *trans,
-				  subvol_inum inum, enum btree_id id)
-{
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	struct bkey_i delete;
-	struct bpos end = POS(inum.inum, U64_MAX);
-	u32 snapshot;
-	int ret = 0;
-
-	/*
-	 * We're never going to be deleting partial extents, no need to use an
-	 * extent iterator:
-	 */
-	bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
-			     BTREE_ITER_intent);
-
-	while (1) {
-		bch2_trans_begin(trans);
-
-		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-		if (ret)
-			goto err;
-
-		bch2_btree_iter_set_snapshot(&iter, snapshot);
-
-		k = bch2_btree_iter_peek_upto(&iter, end);
-		ret = bkey_err(k);
-		if (ret)
-			goto err;
-
-		if (!k.k)
-			break;
-
-		bkey_init(&delete.k);
-		delete.k.p = iter.pos;
-
-		if (iter.flags & BTREE_ITER_is_extents)
-			bch2_key_resize(&delete.k,
-					bpos_min(end, k.k->p).offset -
-					iter.pos.offset);
-
-		ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
-		      bch2_trans_commit(trans, NULL, NULL,
-					BCH_TRANS_COMMIT_no_enospc);
-err:
-		if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			break;
-	}
-
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
-{
-	struct btree_trans *trans = bch2_trans_get(c);
-	struct btree_iter iter = { NULL };
-	struct bkey_i_inode_generation delete;
-	struct bch_inode_unpacked inode_u;
-	struct bkey_s_c k;
-	u32 snapshot;
-	int ret;
-
-	/*
-	 * If this was a directory, there shouldn't be any real dirents left -
-	 * but there could be whiteouts (from hash collisions) that we should
-	 * delete:
-	 *
-	 * XXX: the dirent could ideally would delete whiteouts when they're no
-	 * longer needed
-	 */
-	ret   = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?:
-		bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?:
-		bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents);
-	if (ret)
-		goto err;
-retry:
-	bch2_trans_begin(trans);
-
-	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-	if (ret)
-		goto err;
-
-	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
-			       SPOS(0, inum.inum, snapshot),
-			       BTREE_ITER_intent|BTREE_ITER_cached);
-	ret = bkey_err(k);
-	if (ret)
-		goto err;
-
-	if (!bkey_is_inode(k.k)) {
-		bch2_fs_inconsistent(c,
-				     "inode %llu:%u not found when deleting",
-				     inum.inum, snapshot);
-		ret = -EIO;
-		goto err;
-	}
-
-	bch2_inode_unpack(k, &inode_u);
-
-	bkey_inode_generation_init(&delete.k_i);
-	delete.k.p = iter.pos;
-	delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
-
-	ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
-		bch2_trans_commit(trans, NULL, NULL,
-				BCH_TRANS_COMMIT_no_enospc);
-err:
-	bch2_trans_iter_exit(trans, &iter);
-	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-		goto retry;
-
-	bch2_trans_put(trans);
-	return ret;
-}
-
-int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
-				  subvol_inum inum,
-				  struct bch_inode_unpacked *inode)
-{
-	struct btree_iter iter;
-	int ret;
-
-	ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
-	if (!ret)
-		bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
-				  subvol_inum inum,
-				  struct bch_inode_unpacked *inode)
-{
-	struct btree_iter iter;
-	int ret;
-
-	ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
-	if (!ret)
-		bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
-			    struct bch_inode_unpacked *inode)
-{
-	return bch2_trans_do(c, NULL, NULL, 0,
-		bch2_inode_find_by_inum_trans(trans, inum, inode));
-}
-
-int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
-{
-	if (bi->bi_flags & BCH_INODE_unlinked)
-		bi->bi_flags &= ~BCH_INODE_unlinked;
-	else {
-		if (bi->bi_nlink == U32_MAX)
-			return -EINVAL;
-
-		bi->bi_nlink++;
-	}
-
-	return 0;
-}
-
-void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi)
-{
-	if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) {
-		bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero",
-					bi->bi_inum);
-		return;
-	}
-
-	if (bi->bi_flags & BCH_INODE_unlinked) {
-		bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum);
-		return;
-	}
-
-	if (bi->bi_nlink)
-		bi->bi_nlink--;
-	else
-		bi->bi_flags |= BCH_INODE_unlinked;
-}
-
-struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode)
-{
-	struct bch_opts ret = { 0 };
-#define x(_name, _bits)							\
-	if (inode->bi_##_name)						\
-		opt_set(ret, _name, inode->bi_##_name - 1);
-	BCH_INODE_OPTS()
-#undef x
-	return ret;
-}
-
-void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
-			 struct bch_inode_unpacked *inode)
-{
-#define x(_name, _bits)		opts->_name = inode_opt_get(c, inode, _name);
-	BCH_INODE_OPTS()
-#undef x
-
-	if (opts->nocow)
-		opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0;
-}
-
-int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
-{
-	struct bch_inode_unpacked inode;
-	int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode));
-
-	if (ret)
-		return ret;
-
-	bch2_inode_opts_get(opts, trans->c, &inode);
-	return 0;
-}
-
-int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter iter = { NULL };
-	struct bkey_i_inode_generation delete;
-	struct bch_inode_unpacked inode_u;
-	struct bkey_s_c k;
-	int ret;
-
-	do {
-		ret   = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
-						      SPOS(inum, 0, snapshot),
-						      SPOS(inum, U64_MAX, snapshot),
-						      0, NULL) ?:
-			bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
-						      SPOS(inum, 0, snapshot),
-						      SPOS(inum, U64_MAX, snapshot),
-						      0, NULL) ?:
-			bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
-						      SPOS(inum, 0, snapshot),
-						      SPOS(inum, U64_MAX, snapshot),
-						      0, NULL);
-	} while (ret == -BCH_ERR_transaction_restart_nested);
-	if (ret)
-		goto err;
-retry:
-	bch2_trans_begin(trans);
-
-	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
-			       SPOS(0, inum, snapshot), BTREE_ITER_intent);
-	ret = bkey_err(k);
-	if (ret)
-		goto err;
-
-	if (!bkey_is_inode(k.k)) {
-		bch2_fs_inconsistent(c,
-				     "inode %llu:%u not found when deleting",
-				     inum, snapshot);
-		ret = -EIO;
-		goto err;
-	}
-
-	bch2_inode_unpack(k, &inode_u);
-
-	/* Subvolume root? */
-	if (inode_u.bi_subvol)
-		bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum);
-
-	bkey_inode_generation_init(&delete.k_i);
-	delete.k.p = iter.pos;
-	delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
-
-	ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
-		bch2_trans_commit(trans, NULL, NULL,
-				BCH_TRANS_COMMIT_no_enospc);
-err:
-	bch2_trans_iter_exit(trans, &iter);
-	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-		goto retry;
-
-	return ret ?: -BCH_ERR_transaction_restart_nested;
-}
-
-static int may_delete_deleted_inode(struct btree_trans *trans,
-				    struct btree_iter *iter,
-				    struct bpos pos,
-				    bool *need_another_pass)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter inode_iter;
-	struct bkey_s_c k;
-	struct bch_inode_unpacked inode;
-	int ret;
-
-	k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached);
-	ret = bkey_err(k);
-	if (ret)
-		return ret;
-
-	ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
-	if (fsck_err_on(!bkey_is_inode(k.k), c,
-			deleted_inode_missing,
-			"nonexistent inode %llu:%u in deleted_inodes btree",
-			pos.offset, pos.snapshot))
-		goto delete;
-
-	ret = bch2_inode_unpack(k, &inode);
-	if (ret)
-		goto out;
-
-	if (S_ISDIR(inode.bi_mode)) {
-		ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot);
-		if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY),
-				c, deleted_inode_is_dir,
-				"non empty directory %llu:%u in deleted_inodes btree",
-				pos.offset, pos.snapshot))
-			goto delete;
-		if (ret)
-			goto out;
-	}
-
-	if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), c,
-			deleted_inode_not_unlinked,
-			"non-deleted inode %llu:%u in deleted_inodes btree",
-			pos.offset, pos.snapshot))
-		goto delete;
-
-	if (c->sb.clean &&
-	    !fsck_err(c,
-		      deleted_inode_but_clean,
-		      "filesystem marked as clean but have deleted inode %llu:%u",
-		      pos.offset, pos.snapshot)) {
-		ret = 0;
-		goto out;
-	}
-
-	if (bch2_snapshot_is_internal_node(c, pos.snapshot)) {
-		struct bpos new_min_pos;
-
-		ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos);
-		if (ret)
-			goto out;
-
-		inode.bi_flags &= ~BCH_INODE_unlinked;
-
-		ret = bch2_inode_write_flags(trans, &inode_iter, &inode,
-					     BTREE_UPDATE_internal_snapshot_node);
-		bch_err_msg(c, ret, "clearing inode unlinked flag");
-		if (ret)
-			goto out;
-
-		/*
-		 * We'll need another write buffer flush to pick up the new
-		 * unlinked inodes in the snapshot leaves:
-		 */
-		*need_another_pass = true;
-		goto out;
-	}
-
-	ret = 1;
-out:
-fsck_err:
-	bch2_trans_iter_exit(trans, &inode_iter);
-	return ret;
-delete:
-	ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false);
-	goto out;
-}
-
-int bch2_delete_dead_inodes(struct bch_fs *c)
-{
-	struct btree_trans *trans = bch2_trans_get(c);
-	bool need_another_pass;
-	int ret;
-again:
-	/*
-	 * if we ran check_inodes() unlinked inodes will have already been
-	 * cleaned up but the write buffer will be out of sync; therefore we
-	 * alway need a write buffer flush
-	 */
-	ret = bch2_btree_write_buffer_flush_sync(trans);
-	if (ret)
-		goto err;
-
-	need_another_pass = false;
-
-	/*
-	 * Weird transaction restart handling here because on successful delete,
-	 * bch2_inode_rm_snapshot() will return a nested transaction restart,
-	 * but we can't retry because the btree write buffer won't have been
-	 * flushed and we'd spin:
-	 */
-	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
-					BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
-					NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
-		ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass);
-		if (ret > 0) {
-			bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot);
-
-			ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
-			/*
-			 * We don't want to loop here: a transaction restart
-			 * error here means we handled a transaction restart and
-			 * we're actually done, but if we loop we'll retry the
-			 * same key because the write buffer hasn't been flushed
-			 * yet
-			 */
-			if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-				ret = 0;
-				continue;
-			}
-		}
-
-		ret;
-	}));
-
-	if (!ret && need_another_pass)
-		goto again;
-err:
-	bch2_trans_put(trans);
-	return ret;
-}
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
deleted file mode 100644
index 679f5f5e5d15..000000000000
--- a/fs/bcachefs/inode.h
+++ /dev/null
@@ -1,240 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_INODE_H
-#define _BCACHEFS_INODE_H
-
-#include "bkey.h"
-#include "bkey_methods.h"
-#include "opts.h"
-
-enum bch_validate_flags;
-extern const char * const bch2_inode_opts[];
-
-int bch2_inode_invalid(struct bch_fs *, struct bkey_s_c,
-		       enum bch_validate_flags, struct printbuf *);
-int bch2_inode_v2_invalid(struct bch_fs *, struct bkey_s_c,
-			  enum bch_validate_flags, struct printbuf *);
-int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c,
-			  enum bch_validate_flags, struct printbuf *);
-void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned,
-		       struct bkey_s_c, struct bkey_s,
-		       enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_inode ((struct bkey_ops) {	\
-	.key_invalid	= bch2_inode_invalid,		\
-	.val_to_text	= bch2_inode_to_text,		\
-	.trigger	= bch2_trigger_inode,		\
-	.min_val_size	= 16,				\
-})
-
-#define bch2_bkey_ops_inode_v2 ((struct bkey_ops) {	\
-	.key_invalid	= bch2_inode_v2_invalid,	\
-	.val_to_text	= bch2_inode_to_text,		\
-	.trigger	= bch2_trigger_inode,		\
-	.min_val_size	= 32,				\
-})
-
-#define bch2_bkey_ops_inode_v3 ((struct bkey_ops) {	\
-	.key_invalid	= bch2_inode_v3_invalid,	\
-	.val_to_text	= bch2_inode_to_text,		\
-	.trigger	= bch2_trigger_inode,		\
-	.min_val_size	= 48,				\
-})
-
-static inline bool bkey_is_inode(const struct bkey *k)
-{
-	return  k->type == KEY_TYPE_inode ||
-		k->type == KEY_TYPE_inode_v2 ||
-		k->type == KEY_TYPE_inode_v3;
-}
-
-int bch2_inode_generation_invalid(struct bch_fs *, struct bkey_s_c,
-				  enum bch_validate_flags, struct printbuf *);
-void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_inode_generation ((struct bkey_ops) {	\
-	.key_invalid	= bch2_inode_generation_invalid,	\
-	.val_to_text	= bch2_inode_generation_to_text,	\
-	.min_val_size	= 8,					\
-})
-
-#if 0
-typedef struct {
-	u64			lo;
-	u32			hi;
-} __packed __aligned(4) u96;
-#endif
-typedef u64 u96;
-
-struct bch_inode_unpacked {
-	u64			bi_inum;
-	u64			bi_journal_seq;
-	__le64			bi_hash_seed;
-	u64			bi_size;
-	u64			bi_sectors;
-	u64			bi_version;
-	u32			bi_flags;
-	u16			bi_mode;
-
-#define x(_name, _bits)	u##_bits _name;
-	BCH_INODE_FIELDS_v3()
-#undef  x
-};
-
-struct bkey_inode_buf {
-	struct bkey_i_inode_v3	inode;
-
-#define x(_name, _bits)		+ 8 + _bits / 8
-	u8		_pad[0 + BCH_INODE_FIELDS_v3()];
-#undef  x
-} __packed __aligned(8);
-
-void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
-int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *);
-struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *);
-
-void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
-
-int bch2_inode_peek_nowarn(struct btree_trans *, struct btree_iter *,
-		    struct bch_inode_unpacked *, subvol_inum, unsigned);
-int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
-		    struct bch_inode_unpacked *, subvol_inum, unsigned);
-
-int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *,
-		     struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags);
-
-static inline int bch2_inode_write(struct btree_trans *trans,
-		     struct btree_iter *iter,
-		     struct bch_inode_unpacked *inode)
-{
-	return bch2_inode_write_flags(trans, iter, inode, 0);
-}
-
-int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32);
-int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32);
-
-void bch2_inode_init_early(struct bch_fs *,
-			   struct bch_inode_unpacked *);
-void bch2_inode_init_late(struct bch_inode_unpacked *, u64,
-			  uid_t, gid_t, umode_t, dev_t,
-			  struct bch_inode_unpacked *);
-void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
-		     uid_t, gid_t, umode_t, dev_t,
-		     struct bch_inode_unpacked *);
-
-int bch2_inode_create(struct btree_trans *, struct btree_iter *,
-		      struct bch_inode_unpacked *, u32, u64);
-
-int bch2_inode_rm(struct bch_fs *, subvol_inum);
-
-int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *,
-				  subvol_inum,
-				  struct bch_inode_unpacked *);
-int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
-				  struct bch_inode_unpacked *);
-int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
-			    struct bch_inode_unpacked *);
-
-#define inode_opt_get(_c, _inode, _name)			\
-	((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name)
-
-static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode,
-				      enum inode_opt_id id, u64 v)
-{
-	switch (id) {
-#define x(_name, ...)							\
-	case Inode_opt_##_name:						\
-		inode->bi_##_name = v;					\
-		break;
-	BCH_INODE_OPTS()
-#undef x
-	default:
-		BUG();
-	}
-}
-
-static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode,
-				     enum inode_opt_id id)
-{
-	switch (id) {
-#define x(_name, ...)							\
-	case Inode_opt_##_name:						\
-		return inode->bi_##_name;
-	BCH_INODE_OPTS()
-#undef x
-	default:
-		BUG();
-	}
-}
-
-static inline u8 mode_to_type(umode_t mode)
-{
-	return (mode >> 12) & 15;
-}
-
-static inline u8 inode_d_type(struct bch_inode_unpacked *inode)
-{
-	return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode);
-}
-
-static inline u32 bch2_inode_flags(struct bkey_s_c k)
-{
-	switch (k.k->type) {
-	case KEY_TYPE_inode:
-		return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
-	case KEY_TYPE_inode_v2:
-		return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
-	case KEY_TYPE_inode_v3:
-		return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
-	default:
-		return 0;
-	}
-}
-
-/* i_nlink: */
-
-static inline unsigned nlink_bias(umode_t mode)
-{
-	return S_ISDIR(mode) ? 2 : 1;
-}
-
-static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi)
-{
-	return bi->bi_flags & BCH_INODE_unlinked
-		  ? 0
-		  : bi->bi_nlink + nlink_bias(bi->bi_mode);
-}
-
-static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
-					unsigned nlink)
-{
-	if (nlink) {
-		bi->bi_nlink = nlink - nlink_bias(bi->bi_mode);
-		bi->bi_flags &= ~BCH_INODE_unlinked;
-	} else {
-		bi->bi_nlink = 0;
-		bi->bi_flags |= BCH_INODE_unlinked;
-	}
-}
-
-int bch2_inode_nlink_inc(struct bch_inode_unpacked *);
-void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
-
-static inline bool bch2_inode_should_have_bp(struct bch_inode_unpacked *inode)
-{
-	bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset;
-
-	return S_ISDIR(inode->bi_mode) ||
-		(!inode->bi_nlink && inode_has_bp);
-}
-
-struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
-void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
-			 struct bch_inode_unpacked *);
-int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *);
-
-int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32);
-int bch2_delete_dead_inodes(struct bch_fs *);
-
-#endif /* _BCACHEFS_INODE_H */
diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h
deleted file mode 100644
index 83d107331edf..000000000000
--- a/fs/bcachefs/inode_format.h
+++ /dev/null
@@ -1,166 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_INODE_FORMAT_H
-#define _BCACHEFS_INODE_FORMAT_H
-
-#define BLOCKDEV_INODE_MAX	4096
-#define BCACHEFS_ROOT_INO	4096
-
-struct bch_inode {
-	struct bch_val		v;
-
-	__le64			bi_hash_seed;
-	__le32			bi_flags;
-	__le16			bi_mode;
-	__u8			fields[];
-} __packed __aligned(8);
-
-struct bch_inode_v2 {
-	struct bch_val		v;
-
-	__le64			bi_journal_seq;
-	__le64			bi_hash_seed;
-	__le64			bi_flags;
-	__le16			bi_mode;
-	__u8			fields[];
-} __packed __aligned(8);
-
-struct bch_inode_v3 {
-	struct bch_val		v;
-
-	__le64			bi_journal_seq;
-	__le64			bi_hash_seed;
-	__le64			bi_flags;
-	__le64			bi_sectors;
-	__le64			bi_size;
-	__le64			bi_version;
-	__u8			fields[];
-} __packed __aligned(8);
-
-#define INODEv3_FIELDS_START_INITIAL	6
-#define INODEv3_FIELDS_START_CUR	(offsetof(struct bch_inode_v3, fields) / sizeof(__u64))
-
-struct bch_inode_generation {
-	struct bch_val		v;
-
-	__le32			bi_generation;
-	__le32			pad;
-} __packed __aligned(8);
-
-/*
- * bi_subvol and bi_parent_subvol are only set for subvolume roots:
- */
-
-#define BCH_INODE_FIELDS_v2()			\
-	x(bi_atime,			96)	\
-	x(bi_ctime,			96)	\
-	x(bi_mtime,			96)	\
-	x(bi_otime,			96)	\
-	x(bi_size,			64)	\
-	x(bi_sectors,			64)	\
-	x(bi_uid,			32)	\
-	x(bi_gid,			32)	\
-	x(bi_nlink,			32)	\
-	x(bi_generation,		32)	\
-	x(bi_dev,			32)	\
-	x(bi_data_checksum,		8)	\
-	x(bi_compression,		8)	\
-	x(bi_project,			32)	\
-	x(bi_background_compression,	8)	\
-	x(bi_data_replicas,		8)	\
-	x(bi_promote_target,		16)	\
-	x(bi_foreground_target,		16)	\
-	x(bi_background_target,		16)	\
-	x(bi_erasure_code,		16)	\
-	x(bi_fields_set,		16)	\
-	x(bi_dir,			64)	\
-	x(bi_dir_offset,		64)	\
-	x(bi_subvol,			32)	\
-	x(bi_parent_subvol,		32)
-
-#define BCH_INODE_FIELDS_v3()			\
-	x(bi_atime,			96)	\
-	x(bi_ctime,			96)	\
-	x(bi_mtime,			96)	\
-	x(bi_otime,			96)	\
-	x(bi_uid,			32)	\
-	x(bi_gid,			32)	\
-	x(bi_nlink,			32)	\
-	x(bi_generation,		32)	\
-	x(bi_dev,			32)	\
-	x(bi_data_checksum,		8)	\
-	x(bi_compression,		8)	\
-	x(bi_project,			32)	\
-	x(bi_background_compression,	8)	\
-	x(bi_data_replicas,		8)	\
-	x(bi_promote_target,		16)	\
-	x(bi_foreground_target,		16)	\
-	x(bi_background_target,		16)	\
-	x(bi_erasure_code,		16)	\
-	x(bi_fields_set,		16)	\
-	x(bi_dir,			64)	\
-	x(bi_dir_offset,		64)	\
-	x(bi_subvol,			32)	\
-	x(bi_parent_subvol,		32)	\
-	x(bi_nocow,			8)
-
-/* subset of BCH_INODE_FIELDS */
-#define BCH_INODE_OPTS()			\
-	x(data_checksum,		8)	\
-	x(compression,			8)	\
-	x(project,			32)	\
-	x(background_compression,	8)	\
-	x(data_replicas,		8)	\
-	x(promote_target,		16)	\
-	x(foreground_target,		16)	\
-	x(background_target,		16)	\
-	x(erasure_code,			16)	\
-	x(nocow,			8)
-
-enum inode_opt_id {
-#define x(name, ...)				\
-	Inode_opt_##name,
-	BCH_INODE_OPTS()
-#undef  x
-	Inode_opt_nr,
-};
-
-#define BCH_INODE_FLAGS()			\
-	x(sync,				0)	\
-	x(immutable,			1)	\
-	x(append,			2)	\
-	x(nodump,			3)	\
-	x(noatime,			4)	\
-	x(i_size_dirty,			5)	\
-	x(i_sectors_dirty,		6)	\
-	x(unlinked,			7)	\
-	x(backptr_untrusted,		8)
-
-/* bits 20+ reserved for packed fields below: */
-
-enum bch_inode_flags {
-#define x(t, n)	BCH_INODE_##t = 1U << n,
-	BCH_INODE_FLAGS()
-#undef x
-};
-
-enum __bch_inode_flags {
-#define x(t, n)	__BCH_INODE_##t = n,
-	BCH_INODE_FLAGS()
-#undef x
-};
-
-LE32_BITMASK(INODE_STR_HASH,	struct bch_inode, bi_flags, 20, 24);
-LE32_BITMASK(INODE_NR_FIELDS,	struct bch_inode, bi_flags, 24, 31);
-LE32_BITMASK(INODE_NEW_VARINT,	struct bch_inode, bi_flags, 31, 32);
-
-LE64_BITMASK(INODEv2_STR_HASH,	struct bch_inode_v2, bi_flags, 20, 24);
-LE64_BITMASK(INODEv2_NR_FIELDS,	struct bch_inode_v2, bi_flags, 24, 31);
-
-LE64_BITMASK(INODEv3_STR_HASH,	struct bch_inode_v3, bi_flags, 20, 24);
-LE64_BITMASK(INODEv3_NR_FIELDS,	struct bch_inode_v3, bi_flags, 24, 31);
-
-LE64_BITMASK(INODEv3_FIELDS_START,
-				struct bch_inode_v3, bi_flags, 31, 36);
-LE64_BITMASK(INODEv3_MODE,	struct bch_inode_v3, bi_flags, 36, 52);
-
-#endif /* _BCACHEFS_INODE_FORMAT_H */
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
deleted file mode 100644
index 4ec979b4b23e..000000000000
--- a/fs/bcachefs/io_misc.c
+++ /dev/null
@@ -1,517 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * io_misc.c - fallocate, fpunch, truncate:
- */
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "clock.h"
-#include "error.h"
-#include "extents.h"
-#include "extent_update.h"
-#include "inode.h"
-#include "io_misc.h"
-#include "io_write.h"
-#include "logged_ops.h"
-#include "rebalance.h"
-#include "subvolume.h"
-
-/* Overwrites whatever was present with zeroes: */
-int bch2_extent_fallocate(struct btree_trans *trans,
-			  subvol_inum inum,
-			  struct btree_iter *iter,
-			  u64 sectors,
-			  struct bch_io_opts opts,
-			  s64 *i_sectors_delta,
-			  struct write_point_specifier write_point)
-{
-	struct bch_fs *c = trans->c;
-	struct disk_reservation disk_res = { 0 };
-	struct closure cl;
-	struct open_buckets open_buckets = { 0 };
-	struct bkey_s_c k;
-	struct bkey_buf old, new;
-	unsigned sectors_allocated = 0, new_replicas;
-	bool unwritten = opts.nocow &&
-	    c->sb.version >= bcachefs_metadata_version_unwritten_extents;
-	int ret;
-
-	bch2_bkey_buf_init(&old);
-	bch2_bkey_buf_init(&new);
-	closure_init_stack(&cl);
-
-	k = bch2_btree_iter_peek_slot(iter);
-	ret = bkey_err(k);
-	if (ret)
-		return ret;
-
-	sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);
-	new_replicas = max(0, (int) opts.data_replicas -
-			   (int) bch2_bkey_nr_ptrs_fully_allocated(k));
-
-	/*
-	 * Get a disk reservation before (in the nocow case) calling
-	 * into the allocator:
-	 */
-	ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
-	if (unlikely(ret))
-		goto err_noprint;
-
-	bch2_bkey_buf_reassemble(&old, c, k);
-
-	if (!unwritten) {
-		struct bkey_i_reservation *reservation;
-
-		bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
-		reservation = bkey_reservation_init(new.k);
-		reservation->k.p = iter->pos;
-		bch2_key_resize(&reservation->k, sectors);
-		reservation->v.nr_replicas = opts.data_replicas;
-	} else {
-		struct bkey_i_extent *e;
-		struct bch_devs_list devs_have;
-		struct write_point *wp;
-
-		devs_have.nr = 0;
-
-		bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX);
-
-		e = bkey_extent_init(new.k);
-		e->k.p = iter->pos;
-
-		ret = bch2_alloc_sectors_start_trans(trans,
-				opts.foreground_target,
-				false,
-				write_point,
-				&devs_have,
-				opts.data_replicas,
-				opts.data_replicas,
-				BCH_WATERMARK_normal, 0, &cl, &wp);
-		if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
-			ret = -BCH_ERR_transaction_restart_nested;
-		if (ret)
-			goto err;
-
-		sectors = min_t(u64, sectors, wp->sectors_free);
-		sectors_allocated = sectors;
-
-		bch2_key_resize(&e->k, sectors);
-
-		bch2_open_bucket_get(c, wp, &open_buckets);
-		bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
-		bch2_alloc_sectors_done(c, wp);
-
-		extent_for_each_ptr(extent_i_to_s(e), ptr)
-			ptr->unwritten = true;
-	}
-
-	ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
-				 0, i_sectors_delta, true);
-err:
-	if (!ret && sectors_allocated)
-		bch2_increment_clock(c, sectors_allocated, WRITE);
-	if (should_print_err(ret))
-		bch_err_inum_offset_ratelimited(c,
-			inum.inum,
-			iter->pos.offset << 9,
-			"%s(): error: %s", __func__, bch2_err_str(ret));
-err_noprint:
-	bch2_open_buckets_put(c, &open_buckets);
-	bch2_disk_reservation_put(c, &disk_res);
-	bch2_bkey_buf_exit(&new, c);
-	bch2_bkey_buf_exit(&old, c);
-
-	if (closure_nr_remaining(&cl) != 1) {
-		bch2_trans_unlock(trans);
-		closure_sync(&cl);
-	}
-
-	return ret;
-}
-
-/*
- * Returns -BCH_ERR_transacton_restart if we had to drop locks:
- */
-int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
-		   subvol_inum inum, u64 end,
-		   s64 *i_sectors_delta)
-{
-	struct bch_fs *c	= trans->c;
-	unsigned max_sectors	= KEY_SIZE_MAX & (~0 << c->block_bits);
-	struct bpos end_pos = POS(inum.inum, end);
-	struct bkey_s_c k;
-	int ret = 0, ret2 = 0;
-	u32 snapshot;
-
-	while (!ret ||
-	       bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-		struct disk_reservation disk_res =
-			bch2_disk_reservation_init(c, 0);
-		struct bkey_i delete;
-
-		if (ret)
-			ret2 = ret;
-
-		bch2_trans_begin(trans);
-
-		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-		if (ret)
-			continue;
-
-		bch2_btree_iter_set_snapshot(iter, snapshot);
-
-		/*
-		 * peek_upto() doesn't have ideal semantics for extents:
-		 */
-		k = bch2_btree_iter_peek_upto(iter, end_pos);
-		if (!k.k)
-			break;
-
-		ret = bkey_err(k);
-		if (ret)
-			continue;
-
-		bkey_init(&delete.k);
-		delete.k.p = iter->pos;
-
-		/* create the biggest key we can */
-		bch2_key_resize(&delete.k, max_sectors);
-		bch2_cut_back(end_pos, &delete);
-
-		ret = bch2_extent_update(trans, inum, iter, &delete,
-				&disk_res, 0, i_sectors_delta, false);
-		bch2_disk_reservation_put(c, &disk_res);
-	}
-
-	return ret ?: ret2;
-}
-
-int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
-		s64 *i_sectors_delta)
-{
-	struct btree_trans *trans = bch2_trans_get(c);
-	struct btree_iter iter;
-	int ret;
-
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
-			     POS(inum.inum, start),
-			     BTREE_ITER_intent);
-
-	ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta);
-
-	bch2_trans_iter_exit(trans, &iter);
-	bch2_trans_put(trans);
-
-	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-		ret = 0;
-
-	return ret;
-}
-
-/* truncate: */
-
-void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
-	struct bkey_s_c_logged_op_truncate op = bkey_s_c_to_logged_op_truncate(k);
-
-	prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol));
-	prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum));
-	prt_printf(out, " new_i_size=%llu", le64_to_cpu(op.v->new_i_size));
-}
-
-static int truncate_set_isize(struct btree_trans *trans,
-			      subvol_inum inum,
-			      u64 new_i_size)
-{
-	struct btree_iter iter = { NULL };
-	struct bch_inode_unpacked inode_u;
-	int ret;
-
-	ret   = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent) ?:
-		(inode_u.bi_size = new_i_size, 0) ?:
-		bch2_inode_write(trans, &iter, &inode_u);
-
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
-					    struct bkey_i *op_k,
-					    u64 *i_sectors_delta)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter fpunch_iter;
-	struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k);
-	subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
-	u64 new_i_size = le64_to_cpu(op->v.new_i_size);
-	int ret;
-
-	ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-			truncate_set_isize(trans, inum, new_i_size));
-	if (ret)
-		goto err;
-
-	bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents,
-			     POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9),
-			     BTREE_ITER_intent);
-	ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta);
-	bch2_trans_iter_exit(trans, &fpunch_iter);
-
-	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-		ret = 0;
-err:
-	bch2_logged_op_finish(trans, op_k);
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-int bch2_resume_logged_op_truncate(struct btree_trans *trans, struct bkey_i *op_k)
-{
-	return __bch2_resume_logged_op_truncate(trans, op_k, NULL);
-}
-
-int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sectors_delta)
-{
-	struct bkey_i_logged_op_truncate op;
-
-	bkey_logged_op_truncate_init(&op.k_i);
-	op.v.subvol	= cpu_to_le32(inum.subvol);
-	op.v.inum	= cpu_to_le64(inum.inum);
-	op.v.new_i_size	= cpu_to_le64(new_i_size);
-
-	/*
-	 * Logged ops aren't atomic w.r.t. snapshot creation: creating a
-	 * snapshot while they're in progress, then crashing, will result in the
-	 * resume only proceeding in one of the snapshots
-	 */
-	down_read(&c->snapshot_create_lock);
-	int ret = bch2_trans_run(c,
-		bch2_logged_op_start(trans, &op.k_i) ?:
-		__bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta));
-	up_read(&c->snapshot_create_lock);
-
-	return ret;
-}
-
-/* finsert/fcollapse: */
-
-void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
-	struct bkey_s_c_logged_op_finsert op = bkey_s_c_to_logged_op_finsert(k);
-
-	prt_printf(out, "subvol=%u",		le32_to_cpu(op.v->subvol));
-	prt_printf(out, " inum=%llu",		le64_to_cpu(op.v->inum));
-	prt_printf(out, " dst_offset=%lli",	le64_to_cpu(op.v->dst_offset));
-	prt_printf(out, " src_offset=%llu",	le64_to_cpu(op.v->src_offset));
-}
-
-static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset, s64 len)
-{
-	struct btree_iter iter;
-	struct bch_inode_unpacked inode_u;
-	int ret;
-
-	offset	<<= 9;
-	len	<<= 9;
-
-	ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent);
-	if (ret)
-		return ret;
-
-	if (len > 0) {
-		if (MAX_LFS_FILESIZE - inode_u.bi_size < len) {
-			ret = -EFBIG;
-			goto err;
-		}
-
-		if (offset >= inode_u.bi_size) {
-			ret = -EINVAL;
-			goto err;
-		}
-	}
-
-	inode_u.bi_size += len;
-	inode_u.bi_mtime = inode_u.bi_ctime = bch2_current_time(trans->c);
-
-	ret = bch2_inode_write(trans, &iter, &inode_u);
-err:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
-					   struct bkey_i *op_k,
-					   u64 *i_sectors_delta)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter iter;
-	struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k);
-	subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
-	struct bch_io_opts opts;
-	u64 dst_offset = le64_to_cpu(op->v.dst_offset);
-	u64 src_offset = le64_to_cpu(op->v.src_offset);
-	s64 shift = dst_offset - src_offset;
-	u64 len = abs(shift);
-	u64 pos = le64_to_cpu(op->v.pos);
-	bool insert = shift > 0;
-	int ret = 0;
-
-	ret = bch2_inum_opts_get(trans, inum, &opts);
-	if (ret)
-		return ret;
-
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
-			     POS(inum.inum, 0),
-			     BTREE_ITER_intent);
-
-	switch (op->v.state) {
-case LOGGED_OP_FINSERT_start:
-	op->v.state = LOGGED_OP_FINSERT_shift_extents;
-
-	if (insert) {
-		ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-				adjust_i_size(trans, inum, src_offset, len) ?:
-				bch2_logged_op_update(trans, &op->k_i));
-		if (ret)
-			goto err;
-	} else {
-		bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset));
-
-		ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta);
-		if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			goto err;
-
-		ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-				bch2_logged_op_update(trans, &op->k_i));
-	}
-
-	fallthrough;
-case LOGGED_OP_FINSERT_shift_extents:
-	while (1) {
-		struct disk_reservation disk_res =
-			bch2_disk_reservation_init(c, 0);
-		struct bkey_i delete, *copy;
-		struct bkey_s_c k;
-		struct bpos src_pos = POS(inum.inum, src_offset);
-		u32 snapshot;
-
-		bch2_trans_begin(trans);
-
-		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-		if (ret)
-			goto btree_err;
-
-		bch2_btree_iter_set_snapshot(&iter, snapshot);
-		bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot));
-
-		k = insert
-			? bch2_btree_iter_peek_prev(&iter)
-			: bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX));
-		if ((ret = bkey_err(k)))
-			goto btree_err;
-
-		if (!k.k ||
-		    k.k->p.inode != inum.inum ||
-		    bkey_le(k.k->p, POS(inum.inum, src_offset)))
-			break;
-
-		copy = bch2_bkey_make_mut_noupdate(trans, k);
-		if ((ret = PTR_ERR_OR_ZERO(copy)))
-			goto btree_err;
-
-		if (insert &&
-		    bkey_lt(bkey_start_pos(k.k), src_pos)) {
-			bch2_cut_front(src_pos, copy);
-
-			/* Splitting compressed extent? */
-			bch2_disk_reservation_add(c, &disk_res,
-					copy->k.size *
-					bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy)),
-					BCH_DISK_RESERVATION_NOFAIL);
-		}
-
-		bkey_init(&delete.k);
-		delete.k.p = copy->k.p;
-		delete.k.p.snapshot = snapshot;
-		delete.k.size = copy->k.size;
-
-		copy->k.p.offset += shift;
-		copy->k.p.snapshot = snapshot;
-
-		op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
-
-		ret =   bch2_bkey_set_needs_rebalance(c, copy, &opts) ?:
-			bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
-			bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
-			bch2_logged_op_update(trans, &op->k_i) ?:
-			bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc);
-btree_err:
-		bch2_disk_reservation_put(c, &disk_res);
-
-		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			continue;
-		if (ret)
-			goto err;
-
-		pos = le64_to_cpu(op->v.pos);
-	}
-
-	op->v.state = LOGGED_OP_FINSERT_finish;
-
-	if (!insert) {
-		ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-				adjust_i_size(trans, inum, src_offset, shift) ?:
-				bch2_logged_op_update(trans, &op->k_i));
-	} else {
-		/* We need an inode update to update bi_journal_seq for fsync: */
-		ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-				adjust_i_size(trans, inum, 0, 0) ?:
-				bch2_logged_op_update(trans, &op->k_i));
-	}
-
-	break;
-case LOGGED_OP_FINSERT_finish:
-	break;
-	}
-err:
-	bch_err_fn(c, ret);
-	bch2_logged_op_finish(trans, op_k);
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-int bch2_resume_logged_op_finsert(struct btree_trans *trans, struct bkey_i *op_k)
-{
-	return __bch2_resume_logged_op_finsert(trans, op_k, NULL);
-}
-
-int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum,
-			   u64 offset, u64 len, bool insert,
-			   s64 *i_sectors_delta)
-{
-	struct bkey_i_logged_op_finsert op;
-	s64 shift = insert ? len : -len;
-
-	bkey_logged_op_finsert_init(&op.k_i);
-	op.v.subvol	= cpu_to_le32(inum.subvol);
-	op.v.inum	= cpu_to_le64(inum.inum);
-	op.v.dst_offset	= cpu_to_le64(offset + shift);
-	op.v.src_offset	= cpu_to_le64(offset);
-	op.v.pos	= cpu_to_le64(insert ? U64_MAX : offset);
-
-	/*
-	 * Logged ops aren't atomic w.r.t. snapshot creation: creating a
-	 * snapshot while they're in progress, then crashing, will result in the
-	 * resume only proceeding in one of the snapshots
-	 */
-	down_read(&c->snapshot_create_lock);
-	int ret = bch2_trans_run(c,
-		bch2_logged_op_start(trans, &op.k_i) ?:
-		__bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta));
-	up_read(&c->snapshot_create_lock);
-
-	return ret;
-}
diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h
deleted file mode 100644
index 9cb44a7c43c1..000000000000
--- a/fs/bcachefs/io_misc.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IO_MISC_H
-#define _BCACHEFS_IO_MISC_H
-
-int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
-			  u64, struct bch_io_opts, s64 *,
-			  struct write_point_specifier);
-int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
-		   subvol_inum, u64, s64 *);
-int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
-
-void bch2_logged_op_truncate_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_logged_op_truncate ((struct bkey_ops) {	\
-	.val_to_text	= bch2_logged_op_truncate_to_text,	\
-	.min_val_size	= 24,					\
-})
-
-int bch2_resume_logged_op_truncate(struct btree_trans *, struct bkey_i *);
-
-int bch2_truncate(struct bch_fs *, subvol_inum, u64, u64 *);
-
-void bch2_logged_op_finsert_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_logged_op_finsert ((struct bkey_ops) {	\
-	.val_to_text	= bch2_logged_op_finsert_to_text,	\
-	.min_val_size	= 24,					\
-})
-
-int bch2_resume_logged_op_finsert(struct btree_trans *, struct bkey_i *);
-
-int bch2_fcollapse_finsert(struct bch_fs *, subvol_inum, u64, u64, bool, s64 *);
-
-#endif /* _BCACHEFS_IO_MISC_H */
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
deleted file mode 100644
index f57486794484..000000000000
--- a/fs/bcachefs/io_read.c
+++ /dev/null
@@ -1,1228 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Some low level IO code, and hacks for various block layer limitations
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "clock.h"
-#include "compress.h"
-#include "data_update.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "error.h"
-#include "io_read.h"
-#include "io_misc.h"
-#include "io_write.h"
-#include "subvolume.h"
-#include "trace.h"
-
-#include <linux/sched/mm.h>
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-
-static bool bch2_target_congested(struct bch_fs *c, u16 target)
-{
-	const struct bch_devs_mask *devs;
-	unsigned d, nr = 0, total = 0;
-	u64 now = local_clock(), last;
-	s64 congested;
-	struct bch_dev *ca;
-
-	if (!target)
-		return false;
-
-	rcu_read_lock();
-	devs = bch2_target_to_mask(c, target) ?:
-		&c->rw_devs[BCH_DATA_user];
-
-	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
-		ca = rcu_dereference(c->devs[d]);
-		if (!ca)
-			continue;
-
-		congested = atomic_read(&ca->congested);
-		last = READ_ONCE(ca->congested_last);
-		if (time_after64(now, last))
-			congested -= (now - last) >> 12;
-
-		total += max(congested, 0LL);
-		nr++;
-	}
-	rcu_read_unlock();
-
-	return bch2_rand_range(nr * CONGESTED_MAX) < total;
-}
-
-#else
-
-static bool bch2_target_congested(struct bch_fs *c, u16 target)
-{
-	return false;
-}
-
-#endif
-
-/* Cache promotion on read */
-
-struct promote_op {
-	struct rcu_head		rcu;
-	u64			start_time;
-
-	struct rhash_head	hash;
-	struct bpos		pos;
-
-	struct data_update	write;
-	struct bio_vec		bi_inline_vecs[]; /* must be last */
-};
-
-static const struct rhashtable_params bch_promote_params = {
-	.head_offset	= offsetof(struct promote_op, hash),
-	.key_offset	= offsetof(struct promote_op, pos),
-	.key_len	= sizeof(struct bpos),
-};
-
-static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
-				  struct bpos pos,
-				  struct bch_io_opts opts,
-				  unsigned flags)
-{
-	BUG_ON(!opts.promote_target);
-
-	if (!(flags & BCH_READ_MAY_PROMOTE))
-		return -BCH_ERR_nopromote_may_not;
-
-	if (bch2_bkey_has_target(c, k, opts.promote_target))
-		return -BCH_ERR_nopromote_already_promoted;
-
-	if (bkey_extent_is_unwritten(k))
-		return -BCH_ERR_nopromote_unwritten;
-
-	if (bch2_target_congested(c, opts.promote_target))
-		return -BCH_ERR_nopromote_congested;
-
-	if (rhashtable_lookup_fast(&c->promote_table, &pos,
-				   bch_promote_params))
-		return -BCH_ERR_nopromote_in_flight;
-
-	return 0;
-}
-
-static void promote_free(struct bch_fs *c, struct promote_op *op)
-{
-	int ret;
-
-	bch2_data_update_exit(&op->write);
-
-	ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
-				     bch_promote_params);
-	BUG_ON(ret);
-	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
-	kfree_rcu(op, rcu);
-}
-
-static void promote_done(struct bch_write_op *wop)
-{
-	struct promote_op *op =
-		container_of(wop, struct promote_op, write.op);
-	struct bch_fs *c = op->write.op.c;
-
-	bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
-			       op->start_time);
-	promote_free(c, op);
-}
-
-static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
-{
-	struct bio *bio = &op->write.op.wbio.bio;
-
-	trace_and_count(op->write.op.c, read_promote, &rbio->bio);
-
-	/* we now own pages: */
-	BUG_ON(!rbio->bounce);
-	BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
-
-	memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
-	       sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
-	swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
-
-	bch2_data_update_read_done(&op->write, rbio->pick.crc);
-}
-
-static struct promote_op *__promote_alloc(struct btree_trans *trans,
-					  enum btree_id btree_id,
-					  struct bkey_s_c k,
-					  struct bpos pos,
-					  struct extent_ptr_decoded *pick,
-					  struct bch_io_opts opts,
-					  unsigned sectors,
-					  struct bch_read_bio **rbio)
-{
-	struct bch_fs *c = trans->c;
-	struct promote_op *op = NULL;
-	struct bio *bio;
-	unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
-	int ret;
-
-	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
-		return ERR_PTR(-BCH_ERR_nopromote_no_writes);
-
-	op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL);
-	if (!op) {
-		ret = -BCH_ERR_nopromote_enomem;
-		goto err;
-	}
-
-	op->
author	Bob Moore <robert.moore@intel.com>	2020-01-10 11:31:49 -0800
committer	Rafael J. Wysocki <rafael.j.wysocki@intel.com>	2020-01-13 11:52:48 +0100
commit	800ba7c5eaaa734e4bd66bf0441fc200bbcdca54 (patch)
tree	5754a1e050b45d9e3be2f91f713b0687c61b0c4d /tools/perf/scripts/python/bin/export-to-postgresql-report
parent	fbdd256fe701a680f6eab3fa93dbab1942ab6a9f (diff)