230 files changed, 8496 insertions, 4878 deletions
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
index 1923affcdc62..ee1b6b06a2fd 100644
--- a/fs/9p/cache.h
+++ b/fs/9p/cache.h
@@ -8,9 +8,8 @@
 #ifndef _9P_CACHE_H
 #define _9P_CACHE_H
 
-#include <linux/fscache.h>
-
 #ifdef CONFIG_9P_FSCACHE
+#include <linux/fscache.h>
 
 extern int v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses,
 					  const char *dev_name);
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 805151114e96..de009a33e0e2 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -41,14 +41,24 @@ void v9fs_fid_add(struct dentry *dentry, struct p9_fid **pfid)
 	*pfid = NULL;
 }
 
+static bool v9fs_is_writeable(int mode)
+{
+	if (mode & (P9_OWRITE|P9_ORDWR))
+		return true;
+	else
+		return false;
+}
+
 /**
  * v9fs_fid_find_inode - search for an open fid off of the inode list
  * @inode: return a fid pointing to a specific inode
+ * @want_writeable: only consider fids which are writeable
  * @uid: return a fid belonging to the specified user
+ * @any: ignore uid as a selection criteria
  *
  */
-
-static struct p9_fid *v9fs_fid_find_inode(struct inode *inode, kuid_t uid)
+struct p9_fid *v9fs_fid_find_inode(struct inode *inode, bool want_writeable,
+	kuid_t uid, bool any)
 {
 	struct hlist_head *h;
 	struct p9_fid *fid, *ret = NULL;
@@ -58,7 +68,12 @@ static struct p9_fid *v9fs_fid_find_inode(struct inode *inode, kuid_t uid)
 	spin_lock(&inode->i_lock);
 	h = (struct hlist_head *)&inode->i_private;
 	hlist_for_each_entry(fid, h, ilist) {
-		if (uid_eq(fid->uid, uid)) {
+		if (any || uid_eq(fid->uid, uid)) {
+			if (want_writeable && !v9fs_is_writeable(fid->mode)) {
+				p9_debug(P9_DEBUG_VFS, " mode: %x not writeable?\n",
+							fid->mode);
+				continue;
+			}
 			p9_fid_get(fid);
 			ret = fid;
 			break;
@@ -118,7 +133,7 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any)
 		spin_unlock(&dentry->d_lock);
 	} else {
 		if (dentry->d_inode)
-			ret = v9fs_fid_find_inode(dentry->d_inode, uid);
+			ret = v9fs_fid_find_inode(dentry->d_inode, false, uid, any);
 	}
 
 	return ret;
@@ -299,28 +314,3 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 	return v9fs_fid_lookup_with_uid(dentry, uid, any);
 }
 
-struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
-{
-	int err;
-	struct p9_fid *fid, *ofid;
-
-	ofid = v9fs_fid_lookup_with_uid(dentry, GLOBAL_ROOT_UID, 0);
-	fid = clone_fid(ofid);
-	if (IS_ERR(fid))
-		goto error_out;
-	p9_fid_put(ofid);
-	/*
-	 * writeback fid will only be used to write back the
-	 * dirty pages. We always request for the open fid in read-write
-	 * mode so that a partial page write which result in page
-	 * read can work.
-	 */
-	err = p9_client_open(fid, O_RDWR);
-	if (err < 0) {
-		p9_fid_put(fid);
-		fid = ERR_PTR(err);
-		goto error_out;
-	}
-error_out:
-	return fid;
-}
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index 8a4e8cd12ca2..0c51889a60b3 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -7,14 +7,16 @@
 #ifndef FS_9P_FID_H
 #define FS_9P_FID_H
 #include <linux/list.h>
+#include "v9fs.h"
 
+struct p9_fid *v9fs_fid_find_inode(struct inode *inode, bool want_writeable,
+	kuid_t uid, bool any);
 struct p9_fid *v9fs_fid_lookup(struct dentry *dentry);
 static inline struct p9_fid *v9fs_parent_fid(struct dentry *dentry)
 {
 	return v9fs_fid_lookup(dentry->d_parent);
 }
 void v9fs_fid_add(struct dentry *dentry, struct p9_fid **fid);
-struct p9_fid *v9fs_writeback_fid(struct dentry *dentry);
 void v9fs_open_fid_add(struct inode *inode, struct p9_fid **fid);
 static inline struct p9_fid *clone_fid(struct p9_fid *fid)
 {
@@ -32,4 +34,31 @@ static inline struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
 	p9_fid_put(fid);
 	return nfid;
 }
+/**
+ * v9fs_fid_addmodes - add cache flags to fid mode (for client use only)
+ * @fid: fid to augment
+ * @s_flags: session info mount flags
+ * @s_cache: session info cache flags
+ * @f_flags: unix open flags
+ *
+ * make sure mode reflects flags of underlying mounts
+ * also qid.version == 0 reflects a synthetic or legacy file system
+ * NOTE: these are set after open so only reflect 9p client not
+ * underlying file system on server.
+ */
+static inline void v9fs_fid_add_modes(struct p9_fid *fid, int s_flags,
+	int s_cache, unsigned int f_flags)
+{
+	if (fid->qid.type != P9_QTFILE)
+		return;
+
+	if ((!s_cache) ||
+	   ((fid->qid.version == 0) && !(s_flags & V9FS_IGNORE_QV)) ||
+	   (s_flags & V9FS_DIRECT_IO) || (f_flags & O_DIRECT)) {
+		fid->mode |= P9L_DIRECT; /* no read or write cache */
+	} else if ((!(s_cache & CACHE_WRITEBACK)) ||
+				(f_flags & O_DSYNC) | (s_flags & V9FS_SYNC)) {
+		fid->mode |= P9L_NOWRITECACHE;
+	}
+}
 #endif
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 61a51b90600d..c7f774fe398f 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -38,9 +38,7 @@ enum {
 	/* String options */
 	Opt_uname, Opt_remotename, Opt_cache, Opt_cachetag,
 	/* Options that take no arguments */
-	Opt_nodevmap,
-	/* Cache options */
-	Opt_cache_loose, Opt_fscache, Opt_mmap,
+	Opt_nodevmap, Opt_noxattr, Opt_directio, Opt_ignoreqv,
 	/* Access options */
 	Opt_access, Opt_posixacl,
 	/* Lock timeout option */
@@ -57,10 +55,10 @@ static const match_table_t tokens = {
 	{Opt_uname, "uname=%s"},
 	{Opt_remotename, "aname=%s"},
 	{Opt_nodevmap, "nodevmap"},
+	{Opt_noxattr, "noxattr"},
+	{Opt_directio, "directio"},
+	{Opt_ignoreqv, "ignoreqv"},
 	{Opt_cache, "cache=%s"},
-	{Opt_cache_loose, "loose"},
-	{Opt_fscache, "fscache"},
-	{Opt_mmap, "mmap"},
 	{Opt_cachetag, "cachetag=%s"},
 	{Opt_access, "access=%s"},
 	{Opt_posixacl, "posixacl"},
@@ -68,32 +66,30 @@ static const match_table_t tokens = {
 	{Opt_err, NULL}
 };
 
-static const char *const v9fs_cache_modes[nr__p9_cache_modes] = {
-	[CACHE_NONE]	= "none",
-	[CACHE_MMAP]	= "mmap",
-	[CACHE_LOOSE]	= "loose",
-	[CACHE_FSCACHE]	= "fscache",
-};
-
 /* Interpret mount options for cache mode */
 static int get_cache_mode(char *s)
 {
 	int version = -EINVAL;
 
 	if (!strcmp(s, "loose")) {
-		version = CACHE_LOOSE;
+		version = CACHE_SC_LOOSE;
 		p9_debug(P9_DEBUG_9P, "Cache mode: loose\n");
 	} else if (!strcmp(s, "fscache")) {
-		version = CACHE_FSCACHE;
+		version = CACHE_SC_FSCACHE;
 		p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n");
 	} else if (!strcmp(s, "mmap")) {
-		version = CACHE_MMAP;
+		version = CACHE_SC_MMAP;
 		p9_debug(P9_DEBUG_9P, "Cache mode: mmap\n");
+	} else if (!strcmp(s, "readahead")) {
+		version = CACHE_SC_READAHEAD;
+		p9_debug(P9_DEBUG_9P, "Cache mode: readahead\n");
 	} else if (!strcmp(s, "none")) {
-		version = CACHE_NONE;
+		version = CACHE_SC_NONE;
 		p9_debug(P9_DEBUG_9P, "Cache mode: none\n");
-	} else
-		pr_info("Unknown Cache mode %s\n", s);
+	} else if (kstrtoint(s, 0, &version) != 0) {
+		version = -EINVAL;
+		pr_info("Unknown Cache mode or invalid value %s\n", s);
+	}
 	return version;
 }
 
@@ -121,9 +117,9 @@ int v9fs_show_options(struct seq_file *m, struct dentry *root)
 	if (v9ses->nodev)
 		seq_puts(m, ",nodevmap");
 	if (v9ses->cache)
-		seq_printf(m, ",%s", v9fs_cache_modes[v9ses->cache]);
+		seq_printf(m, ",cache=%x", v9ses->cache);
 #ifdef CONFIG_9P_FSCACHE
-	if (v9ses->cachetag && v9ses->cache == CACHE_FSCACHE)
+	if (v9ses->cachetag && (v9ses->cache & CACHE_FSCACHE))
 		seq_printf(m, ",cachetag=%s", v9ses->cachetag);
 #endif
 
@@ -143,9 +139,16 @@ int v9fs_show_options(struct seq_file *m, struct dentry *root)
 		break;
 	}
 
+	if (v9ses->flags & V9FS_IGNORE_QV)
+		seq_puts(m, ",ignoreqv");
+	if (v9ses->flags & V9FS_DIRECT_IO)
+		seq_puts(m, ",directio");
 	if (v9ses->flags & V9FS_POSIX_ACL)
 		seq_puts(m, ",posixacl");
 
+	if (v9ses->flags & V9FS_NO_XATTR)
+		seq_puts(m, ",noxattr");
+
 	return p9_show_client_options(m, v9ses->clnt);
 }
 
@@ -266,14 +269,14 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 		case Opt_nodevmap:
 			v9ses->nodev = 1;
 			break;
-		case Opt_cache_loose:
-			v9ses->cache = CACHE_LOOSE;
+		case Opt_noxattr:
+			v9ses->flags |= V9FS_NO_XATTR;
 			break;
-		case Opt_fscache:
-			v9ses->cache = CACHE_FSCACHE;
+		case Opt_directio:
+			v9ses->flags |= V9FS_DIRECT_IO;
 			break;
-		case Opt_mmap:
-			v9ses->cache = CACHE_MMAP;
+		case Opt_ignoreqv:
+			v9ses->flags |= V9FS_IGNORE_QV;
 			break;
 		case Opt_cachetag:
 #ifdef CONFIG_9P_FSCACHE
@@ -468,7 +471,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 
 #ifdef CONFIG_9P_FSCACHE
 	/* register the session for caching */
-	if (v9ses->cache == CACHE_FSCACHE) {
+	if (v9ses->cache & CACHE_FSCACHE) {
 		rc = v9fs_cache_session_get_cookie(v9ses, dev_name);
 		if (rc < 0)
 			goto err_clnt;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index f3f74d197b5d..06a2514f0d88 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -31,29 +31,54 @@
 #define V9FS_ACL_MASK V9FS_POSIX_ACL
 
 enum p9_session_flags {
-	V9FS_PROTO_2000U	= 0x01,
-	V9FS_PROTO_2000L	= 0x02,
-	V9FS_ACCESS_SINGLE	= 0x04,
-	V9FS_ACCESS_USER	= 0x08,
-	V9FS_ACCESS_CLIENT	= 0x10,
-	V9FS_POSIX_ACL		= 0x20
+	V9FS_PROTO_2000U    = 0x01,
+	V9FS_PROTO_2000L    = 0x02,
+	V9FS_ACCESS_SINGLE  = 0x04,
+	V9FS_ACCESS_USER    = 0x08,
+	V9FS_ACCESS_CLIENT  = 0x10,
+	V9FS_POSIX_ACL      = 0x20,
+	V9FS_NO_XATTR       = 0x40,
+	V9FS_IGNORE_QV      = 0x80, /* ignore qid.version for cache hints */
+	V9FS_DIRECT_IO      = 0x100,
+	V9FS_SYNC           = 0x200
 };
 
-/* possible values of ->cache */
 /**
- * enum p9_cache_modes - user specified cache preferences
- * @CACHE_NONE: do not cache data, dentries, or directory contents (default)
- * @CACHE_LOOSE: cache data, dentries, and directory contents w/no consistency
+ * enum p9_cache_shortcuts - human readable cache preferences
+ * @CACHE_SC_NONE: disable all caches
+ * @CACHE_SC_READAHEAD: only provide caching for readahead
+ * @CACHE_SC_MMAP: provide caching to enable mmap
+ * @CACHE_SC_LOOSE: non-coherent caching for files and meta data
+ * @CACHE_SC_FSCACHE: persistent non-coherent caching for files and meta-data
  *
- * eventually support loose, tight, time, session, default always none
  */
 
-enum p9_cache_modes {
-	CACHE_NONE,
-	CACHE_MMAP,
-	CACHE_LOOSE,
-	CACHE_FSCACHE,
-	nr__p9_cache_modes
+enum p9_cache_shortcuts {
+	CACHE_SC_NONE       = 0b00000000,
+	CACHE_SC_READAHEAD  = 0b00000001,
+	CACHE_SC_MMAP       = 0b00000101,
+	CACHE_SC_LOOSE      = 0b00001111,
+	CACHE_SC_FSCACHE    = 0b10001111,
+};
+
+/**
+ * enum p9_cache_bits - possible values of ->cache
+ * @CACHE_NONE: caches disabled
+ * @CACHE_FILE: file caching (open to close)
+ * @CACHE_META: meta-data and directory caching
+ * @CACHE_WRITEBACK: write-back caching for files
+ * @CACHE_LOOSE: don't check cache consistency
+ * @CACHE_FSCACHE: local persistent caches
+ *
+ */
+
+enum p9_cache_bits {
+	CACHE_NONE          = 0b00000000,
+	CACHE_FILE          = 0b00000001,
+	CACHE_META          = 0b00000010,
+	CACHE_WRITEBACK     = 0b00000100,
+	CACHE_LOOSE         = 0b00001000,
+	CACHE_FSCACHE       = 0b10000000,
 };
 
 /**
@@ -62,7 +87,7 @@ enum p9_cache_modes {
  * @nodev: set to 1 to disable device mapping
  * @debug: debug level
  * @afid: authentication handle
- * @cache: cache mode of type &p9_cache_modes
+ * @cache: cache mode of type &p9_cache_bits
  * @cachetag: the tag of the cache associated with this session
  * @fscache: session cookie associated with FS-Cache
  * @uname: string user name to mount hierarchy as
@@ -112,7 +137,6 @@ struct v9fs_inode {
 	struct netfs_inode netfs; /* Netfslib context and vfs inode */
 	struct p9_qid qid;
 	unsigned int cache_validity;
-	struct p9_fid *writeback_fid;
 	struct mutex v_mutex;
 };
 
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 75106b9f293d..cdf441f22e07 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -36,10 +36,6 @@ extern const struct file_operations v9fs_dir_operations;
 extern const struct file_operations v9fs_dir_operations_dotl;
 extern const struct dentry_operations v9fs_dentry_operations;
 extern const struct dentry_operations v9fs_cached_dentry_operations;
-extern const struct file_operations v9fs_cached_file_operations;
-extern const struct file_operations v9fs_cached_file_operations_dotl;
-extern const struct file_operations v9fs_mmap_file_operations;
-extern const struct file_operations v9fs_mmap_file_operations_dotl;
 extern struct kmem_cache *v9fs_inode_cache;
 
 struct inode *v9fs_alloc_inode(struct super_block *sb);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 425956eb9fde..8a635999a7d6 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -56,8 +56,6 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
  */
 static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)
 {
-	struct inode *inode = file_inode(file);
-	struct v9fs_inode *v9inode = V9FS_I(inode);
 	struct p9_fid *fid = file->private_data;
 
 	BUG_ON(!fid);
@@ -65,11 +63,8 @@ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)
 	/* we might need to read from a fid that was opened write-only
 	 * for read-modify-write of page cache, use the writeback fid
 	 * for that */
-	if (rreq->origin == NETFS_READ_FOR_WRITE &&
-			(fid->mode & O_ACCMODE) == O_WRONLY) {
-		fid = v9inode->writeback_fid;
-		BUG_ON(!fid);
-	}
+	WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE &&
+			!(fid->mode & P9_ORDWR));
 
 	p9_fid_get(fid);
 	rreq->netfs_priv = fid;
@@ -119,8 +114,6 @@ const struct netfs_request_ops v9fs_req_ops = {
 
 static bool v9fs_release_folio(struct folio *folio, gfp_t gfp)
 {
-	struct inode *inode = folio_inode(folio);
-
 	if (folio_test_private(folio))
 		return false;
 #ifdef CONFIG_9P_FSCACHE
@@ -129,8 +122,8 @@ static bool v9fs_release_folio(struct folio *folio, gfp_t gfp)
 			return false;
 		folio_wait_fscache(folio);
 	}
+	fscache_note_page_release(v9fs_inode_cookie(V9FS_I(folio_inode(folio))));
 #endif
-	fscache_note_page_release(v9fs_inode_cookie(V9FS_I(inode)));
 	return true;
 }
 
@@ -140,6 +133,7 @@ static void v9fs_invalidate_folio(struct folio *folio, size_t offset,
 	folio_wait_fscache(folio);
 }
 
+#ifdef CONFIG_9P_FSCACHE
 static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error,
 				     bool was_async)
 {
@@ -153,17 +147,19 @@ static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error,
 				   i_size_read(&v9inode->netfs.inode), 0);
 	}
 }
+#endif
 
 static int v9fs_vfs_write_folio_locked(struct folio *folio)
 {
 	struct inode *inode = folio_inode(folio);
-	struct v9fs_inode *v9inode = V9FS_I(inode);
-	struct fscache_cookie *cookie = v9fs_inode_cookie(v9inode);
 	loff_t start = folio_pos(folio);
 	loff_t i_size = i_size_read(inode);
 	struct iov_iter from;
 	size_t len = folio_size(folio);
+	struct p9_fid *writeback_fid;
 	int err;
+	struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode);
+	struct fscache_cookie __maybe_unused *cookie = v9fs_inode_cookie(v9inode);
 
 	if (start >= i_size)
 		return 0; /* Simultaneous truncation occurred */
@@ -172,25 +168,33 @@ static int v9fs_vfs_write_folio_locked(struct folio *folio)
 
 	iov_iter_xarray(&from, ITER_SOURCE, &folio_mapping(folio)->i_pages, start, len);
 
-	/* We should have writeback_fid always set */
-	BUG_ON(!v9inode->writeback_fid);
+	writeback_fid = v9fs_fid_find_inode(inode, true, INVALID_UID, true);
+	if (!writeback_fid) {
+		WARN_ONCE(1, "folio expected an open fid inode->i_private=%p\n",
+			inode->i_private);
+		return -EINVAL;
+	}
 
 	folio_wait_fscache(folio);
 	folio_start_writeback(folio);
 
-	p9_client_write(v9inode->writeback_fid, start, &from, &err);
+	p9_client_write(writeback_fid, start, &from, &err);
 
+#ifdef CONFIG_9P_FSCACHE
 	if (err == 0 &&
-	    fscache_cookie_enabled(cookie) &&
-	    test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) {
+		fscache_cookie_enabled(cookie) &&
+		test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) {
 		folio_start_fscache(folio);
 		fscache_write_to_cache(v9fs_inode_cookie(v9inode),
-				       folio_mapping(folio), start, len, i_size,
-				       v9fs_write_to_cache_done, v9inode,
-				       true);
+					folio_mapping(folio), start, len, i_size,
+					v9fs_write_to_cache_done, v9inode,
+					true);
 	}
+#endif
 
 	folio_end_writeback(folio);
+	p9_fid_put(writeback_fid);
+
 	return err;
 }
 
@@ -297,7 +301,6 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping,
 	loff_t last_pos = pos + copied;
 	struct folio *folio = page_folio(subpage);
 	struct inode *inode = mapping->host;
-	struct v9fs_inode *v9inode = V9FS_I(inode);
 
 	p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
 
@@ -317,7 +320,10 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping,
 	if (last_pos > inode->i_size) {
 		inode_add_bytes(inode, last_pos - inode->i_size);
 		i_size_write(inode, last_pos);
-		fscache_update_cookie(v9fs_inode_cookie(v9inode), NULL, &last_pos);
+#ifdef CONFIG_9P_FSCACHE
+		fscache_update_cookie(v9fs_inode_cookie(V9FS_I(inode)), NULL,
+			&last_pos);
+#endif
 	}
 	folio_mark_dirty(folio);
 out:
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 52bf87934650..45b684b7d8d7 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -196,9 +196,9 @@ static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx)
 
 
 /**
- * v9fs_dir_release - called on a close of a file or directory
- * @inode: inode of the directory
- * @filp: file pointer to a directory
+ * v9fs_dir_release - close a directory or a file
+ * @inode: inode of the directory or file
+ * @filp: file pointer to a directory or file
  *
  */
 
@@ -213,7 +213,11 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
 	fid = filp->private_data;
 	p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n",
 		 inode, filp, fid ? fid->fid : -1);
+
 	if (fid) {
+		if ((S_ISREG(inode->i_mode)) && (filp->f_mode & FMODE_WRITE))
+			retval = filemap_fdatawrite(inode->i_mapping);
+
 		spin_lock(&inode->i_lock);
 		hlist_del(&fid->ilist);
 		spin_unlock(&inode->i_lock);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 367a851eaa82..6c31b8c8112d 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -28,7 +28,6 @@
 #include "fid.h"
 #include "cache.h"
 
-static const struct vm_operations_struct v9fs_file_vm_ops;
 static const struct vm_operations_struct v9fs_mmap_file_vm_ops;
 
 /**
@@ -41,13 +40,11 @@ static const struct vm_operations_struct v9fs_mmap_file_vm_ops;
 int v9fs_file_open(struct inode *inode, struct file *file)
 {
 	int err;
-	struct v9fs_inode *v9inode;
 	struct v9fs_session_info *v9ses;
-	struct p9_fid *fid, *writeback_fid;
+	struct p9_fid *fid;
 	int omode;
 
 	p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
-	v9inode = V9FS_I(inode);
 	v9ses = v9fs_inode2v9ses(inode);
 	if (v9fs_proto_dotl(v9ses))
 		omode = v9fs_open_to_dotl_flags(file->f_flags);
@@ -60,7 +57,19 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 		if (IS_ERR(fid))
 			return PTR_ERR(fid);
 
-		err = p9_client_open(fid, omode);
+		if ((v9ses->cache & CACHE_WRITEBACK) && (omode & P9_OWRITE)) {
+			int writeback_omode = (omode & ~P9_OWRITE) | P9_ORDWR;
+
+			p9_debug(P9_DEBUG_CACHE, "write-only file with writeback enabled, try opening O_RDWR\n");
+			err = p9_client_open(fid, writeback_omode);
+			if (err < 0) {
+				p9_debug(P9_DEBUG_CACHE, "could not open O_RDWR, disabling caches\n");
+				err = p9_client_open(fid, omode);
+				fid->mode |= P9L_DIRECT;
+			}
+		} else {
+			err = p9_client_open(fid, omode);
+		}
 		if (err < 0) {
 			p9_fid_put(fid);
 			return err;
@@ -72,36 +81,14 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 		file->private_data = fid;
 	}
 
-	mutex_lock(&v9inode->v_mutex);
-	if ((v9ses->cache) && !v9inode->writeback_fid &&
-	    ((file->f_flags & O_ACCMODE) != O_RDONLY)) {
-		/*
-		 * clone a fid and add it to writeback_fid
-		 * we do it during open time instead of
-		 * page dirty time via write_begin/page_mkwrite
-		 * because we want write after unlink usecase
-		 * to work.
-		 */
-		writeback_fid = v9fs_writeback_fid(file_dentry(file));
-		if (IS_ERR(writeback_fid)) {
-			err = PTR_ERR(writeback_fid);
-			mutex_unlock(&v9inode->v_mutex);
-			goto out_error;
-		}
-		v9inode->writeback_fid = (void *) writeback_fid;
-	}
-	mutex_unlock(&v9inode->v_mutex);
 #ifdef CONFIG_9P_FSCACHE
-	if (v9ses->cache == CACHE_FSCACHE)
-		fscache_use_cookie(v9fs_inode_cookie(v9inode),
+	if (v9ses->cache & CACHE_FSCACHE)
+		fscache_use_cookie(v9fs_inode_cookie(V9FS_I(inode)),
 				   file->f_mode & FMODE_WRITE);
 #endif
+	v9fs_fid_add_modes(fid, v9ses->flags, v9ses->cache, file->f_flags);
 	v9fs_open_fid_add(inode, &fid);
 	return 0;
-out_error:
-	p9_fid_put(file->private_data);
-	file->private_data = NULL;
-	return err;
 }
 
 /**
@@ -368,8 +355,13 @@ v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	struct p9_fid *fid = iocb->ki_filp->private_data;
 	int ret, err = 0;
 
-	p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n",
-		 iov_iter_count(to), iocb->ki_pos);
+	p9_debug(P9_DEBUG_VFS, "fid %d count %zu offset %lld\n",
+		 fid->fid, iov_iter_count(to), iocb->ki_pos);
+
+	if (!(fid->mode & P9L_DIRECT)) {
+		p9_debug(P9_DEBUG_VFS, "(cached)\n");
+		return generic_file_read_iter(iocb, to);
+	}
 
 	if (iocb->ki_filp->f_flags & O_NONBLOCK)
 		ret = p9_client_read_once(fid, iocb->ki_pos, to, &err);
@@ -392,10 +384,18 @@ static ssize_t
 v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
+	struct p9_fid *fid = file->private_data;
 	ssize_t retval;
 	loff_t origin;
 	int err = 0;
 
+	p9_debug(P9_DEBUG_VFS, "fid %d\n", fid->fid);
+
+	if (!(fid->mode & (P9L_DIRECT | P9L_NOWRITECACHE))) {
+		p9_debug(P9_DEBUG_CACHE, "(cached)\n");
+		return generic_file_write_iter(iocb, from);
+	}
+
 	retval = generic_write_checks(iocb, from);
 	if (retval <= 0)
 		return retval;
@@ -477,45 +477,18 @@ static int
 v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma)
 {
 	int retval;
+	struct inode *inode = file_inode(filp);
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
 
+	p9_debug(P9_DEBUG_MMAP, "filp :%p\n", filp);
 
-	retval = generic_file_mmap(filp, vma);
-	if (!retval)
-		vma->vm_ops = &v9fs_file_vm_ops;
-
-	return retval;
-}
-
-static int
-v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma)
-{
-	int retval;
-	struct inode *inode;
-	struct v9fs_inode *v9inode;
-	struct p9_fid *fid;
-
-	inode = file_inode(filp);
-	v9inode = V9FS_I(inode);
-	mutex_lock(&v9inode->v_mutex);
-	if (!v9inode->writeback_fid &&
-	    (vma->vm_flags & VM_SHARED) &&
-	    (vma->vm_flags & VM_WRITE)) {
-		/*
-		 * clone a fid and add it to writeback_fid
-		 * we do it during mmap instead of
-		 * page dirty time via write_begin/page_mkwrite
-		 * because we want write after unlink usecase
-		 * to work.
-		 */
-		fid = v9fs_writeback_fid(file_dentry(filp));
-		if (IS_ERR(fid)) {
-			retval = PTR_ERR(fid);
-			mutex_unlock(&v9inode->v_mutex);
-			return retval;
-		}
-		v9inode->writeback_fid = (void *) fid;
+	if (!(v9ses->cache & CACHE_WRITEBACK)) {
+		p9_debug(P9_DEBUG_CACHE, "(no mmap mode)");
+		if (vma->vm_flags & VM_MAYSHARE)
+			return -ENODEV;
+		invalidate_inode_pages2(filp->f_mapping);
+		return generic_file_readonly_mmap(filp, vma);
 	}
-	mutex_unlock(&v9inode->v_mutex);
 
 	retval = generic_file_mmap(filp, vma);
 	if (!retval)
@@ -527,7 +500,6 @@ v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma)
 static vm_fault_t
 v9fs_vm_page_mkwrite(struct vm_fault *vmf)
 {
-	struct v9fs_inode *v9inode;
 	struct folio *folio = page_folio(vmf->page);
 	struct file *filp = vmf->vma->vm_file;
 	struct inode *inode = file_inode(filp);
@@ -536,8 +508,6 @@ v9fs_vm_page_mkwrite(struct vm_fault *vmf)
 	p9_debug(P9_DEBUG_VFS, "folio %p fid %lx\n",
 		 folio, (unsigned long)filp->private_data);
 
-	v9inode = V9FS_I(inode);
-
 	/* Wait for the page to be written to the cache before we allow it to
 	 * be modified.  We then assume the entire page will need writing back.
 	 */
@@ -550,7 +520,6 @@ v9fs_vm_page_mkwrite(struct vm_fault *vmf)
 	/* Update file times before taking page lock */
 	file_update_time(filp);
 
-	BUG_ON(!v9inode->writeback_fid);
 	if (folio_lock_killable(folio) < 0)
 		return VM_FAULT_RETRY;
 	if (folio_mapping(folio) != inode->i_mapping)
@@ -563,35 +532,6 @@ out_unlock:
 	return VM_FAULT_NOPAGE;
 }
 
-/**
- * v9fs_mmap_file_read_iter - read from a file
- * @iocb: The operation parameters
- * @to: The buffer to read into
- *
- */
-static ssize_t
-v9fs_mmap_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
-{
-	/* TODO: Check if there are dirty pages */
-	return v9fs_file_read_iter(iocb, to);
-}
-
-/**
- * v9fs_mmap_file_write_iter - write to a file
- * @iocb: The operation parameters
- * @from: The data to write
- *
- */
-static ssize_t
-v9fs_mmap_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
-	/*
-	 * TODO: invalidate mmaps on filp's inode between
-	 * offset and offset+count
-	 */
-	return v9fs_file_write_iter(iocb, from);
-}
-
 static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
 {
 	struct inode *inode;
@@ -614,13 +554,6 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
 	filemap_fdatawrite_wbc(inode->i_mapping, &wbc);
 }
 
-
-static const struct vm_operations_struct v9fs_file_vm_ops = {
-	.fault = filemap_fault,
-	.map_pages = filemap_map_pages,
-	.page_mkwrite = v9fs_vm_page_mkwrite,
-};
-
 static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
 	.close = v9fs_mmap_vm_close,
 	.fault = filemap_fault,
@@ -628,34 +561,6 @@ static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
 	.page_mkwrite = v9fs_vm_page_mkwrite,
 };
 
-
-const struct file_operations v9fs_cached_file_operations = {
-	.llseek = generic_file_llseek,
-	.read_iter = generic_file_read_iter,
-	.write_iter = generic_file_write_iter,
-	.open = v9fs_file_open,
-	.release = v9fs_dir_release,
-	.lock = v9fs_file_lock,
-	.mmap = v9fs_file_mmap,
-	.splice_read = generic_file_splice_read,
-	.splice_write = iter_file_splice_write,
-	.fsync = v9fs_file_fsync,
-};
-
-const struct file_operations v9fs_cached_file_operations_dotl = {
-	.llseek = generic_file_llseek,
-	.read_iter = generic_file_read_iter,
-	.write_iter = generic_file_write_iter,
-	.open = v9fs_file_open,
-	.release = v9fs_dir_release,
-	.lock = v9fs_file_lock_dotl,
-	.flock = v9fs_file_flock_dotl,
-	.mmap = v9fs_file_mmap,
-	.splice_read = generic_file_splice_read,
-	.splice_write = iter_file_splice_write,
-	.fsync = v9fs_file_fsync_dotl,
-};
-
 const struct file_operations v9fs_file_operations = {
 	.llseek = generic_file_llseek,
 	.read_iter = v9fs_file_read_iter,
@@ -677,34 +582,7 @@ const struct file_operations v9fs_file_operations_dotl = {
 	.release = v9fs_dir_release,
 	.lock = v9fs_file_lock_dotl,
 	.flock = v9fs_file_flock_dotl,
-	.mmap = generic_file_readonly_mmap,
-	.splice_read = generic_file_splice_read,
-	.splice_write = iter_file_splice_write,
-	.fsync = v9fs_file_fsync_dotl,
-};
-
-const struct file_operations v9fs_mmap_file_operations = {
-	.llseek = generic_file_llseek,
-	.read_iter = v9fs_mmap_file_read_iter,
-	.write_iter = v9fs_mmap_file_write_iter,
-	.open = v9fs_file_open,
-	.release = v9fs_dir_release,
-	.lock = v9fs_file_lock,
-	.mmap = v9fs_mmap_file_mmap,
-	.splice_read = generic_file_splice_read,
-	.splice_write = iter_file_splice_write,
-	.fsync = v9fs_file_fsync,
-};
-
-const struct file_operations v9fs_mmap_file_operations_dotl = {
-	.llseek = generic_file_llseek,
-	.read_iter = v9fs_mmap_file_read_iter,
-	.write_iter = v9fs_mmap_file_write_iter,
-	.open = v9fs_file_open,
-	.release = v9fs_dir_release,
-	.lock = v9fs_file_lock_dotl,
-	.flock = v9fs_file_flock_dotl,
-	.mmap = v9fs_mmap_file_mmap,
+	.mmap = v9fs_file_mmap,
 	.splice_read = generic_file_splice_read,
 	.splice_write = iter_file_splice_write,
 	.fsync = v9fs_file_fsync_dotl,
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 502ac74e4959..36b466e35887 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -229,7 +229,6 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
 	v9inode = alloc_inode_sb(sb, v9fs_inode_cache, GFP_KERNEL);
 	if (!v9inode)
 		return NULL;
-	v9inode->writeback_fid = NULL;
 	v9inode->cache_validity = 0;
 	mutex_init(&v9inode->v_mutex);
 	return &v9inode->netfs.inode;
@@ -286,24 +285,10 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
 	case S_IFREG:
 		if (v9fs_proto_dotl(v9ses)) {
 			inode->i_op = &v9fs_file_inode_operations_dotl;
-			if (v9ses->cache == CACHE_LOOSE ||
-			    v9ses->cache == CACHE_FSCACHE)
-				inode->i_fop =
-					&v9fs_cached_file_operations_dotl;
-			else if (v9ses->cache == CACHE_MMAP)
-				inode->i_fop = &v9fs_mmap_file_operations_dotl;
-			else
-				inode->i_fop = &v9fs_file_operations_dotl;
+			inode->i_fop = &v9fs_file_operations_dotl;
 		} else {
 			inode->i_op = &v9fs_file_inode_operations;
-			if (v9ses->cache == CACHE_LOOSE ||
-			    v9ses->cache == CACHE_FSCACHE)
-				inode->i_fop =
-					&v9fs_cached_file_operations;
-			else if (v9ses->cache == CACHE_MMAP)
-				inode->i_fop = &v9fs_mmap_file_operations;
-			else
-				inode->i_fop = &v9fs_file_operations;
+			inode->i_fop = &v9fs_file_operations;
 		}
 
 		break;
@@ -385,20 +370,23 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev)
  */
 void v9fs_evict_inode(struct inode *inode)
 {
-	struct v9fs_inode *v9inode = V9FS_I(inode);
-	__le32 version;
+	struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode);
+	__le32 __maybe_unused version;
 
 	truncate_inode_pages_final(&inode->i_data);
+
+#ifdef CONFIG_9P_FSCACHE
 	version = cpu_to_le32(v9inode->qid.version);
 	fscache_clear_inode_writeback(v9fs_inode_cookie(v9inode), inode,
 				      &version);
+#endif
+
 	clear_inode(inode);
 	filemap_fdatawrite(&inode->i_data);
 
+#ifdef CONFIG_9P_FSCACHE
 	fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false);
-	/* clunk the fid stashed in writeback_fid */
-	p9_fid_put(v9inode->writeback_fid);
-	v9inode->writeback_fid = NULL;
+#endif
 }
 
 static int v9fs_test_inode(struct inode *inode, void *data)
@@ -778,7 +766,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 		inode = NULL;
 	else if (IS_ERR(fid))
 		inode = ERR_CAST(fid);
-	else if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+	else if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
 		inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
 	else
 		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
@@ -807,11 +795,12 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
 {
 	int err;
 	u32 perm;
-	struct v9fs_inode *v9inode;
+	struct v9fs_inode __maybe_unused *v9inode;
 	struct v9fs_session_info *v9ses;
-	struct p9_fid *fid, *inode_fid;
+	struct p9_fid *fid;
 	struct dentry *res = NULL;
 	struct inode *inode;
+	int p9_omode;
 
 	if (d_in_lookup(dentry)) {
 		res = v9fs_vfs_lookup(dir, dentry, 0);
@@ -830,9 +819,14 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
 
 	v9ses = v9fs_inode2v9ses(dir);
 	perm = unixmode2p9mode(v9ses, mode);
-	fid = v9fs_create(v9ses, dir, dentry, NULL, perm,
-				v9fs_uflags2omode(flags,
-						v9fs_proto_dotu(v9ses)));
+	p9_omode = v9fs_uflags2omode(flags, v9fs_proto_dotu(v9ses));
+
+	if ((v9ses->cache & CACHE_WRITEBACK) && (p9_omode & P9_OWRITE)) {
+		p9_omode = (p9_omode & ~P9_OWRITE) | P9_ORDWR;
+		p9_debug(P9_DEBUG_CACHE,
+			"write-only file with writeback enabled, creating w/ O_RDWR\n");
+	}
+	fid = v9fs_create(v9ses, dir, dentry, NULL, perm, p9_omode);
 	if (IS_ERR(fid)) {
 		err = PTR_ERR(fid);
 		goto error;
@@ -841,33 +835,18 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
 	v9fs_invalidate_inode_attr(dir);
 	inode = d_inode(dentry);
 	v9inode = V9FS_I(inode);
-	mutex_lock(&v9inode->v_mutex);
-	if ((v9ses->cache) && !v9inode->writeback_fid &&
-	    ((flags & O_ACCMODE) != O_RDONLY)) {
-		/*
-		 * clone a fid and add it to writeback_fid
-		 * we do it during open time instead of
-		 * page dirty time via write_begin/page_mkwrite
-		 * because we want write after unlink usecase
-		 * to work.
-		 */
-		inode_fid = v9fs_writeback_fid(dentry);
-		if (IS_ERR(inode_fid)) {
-			err = PTR_ERR(inode_fid);
-			mutex_unlock(&v9inode->v_mutex);
-			goto error;
-		}
-		v9inode->writeback_fid = (void *) inode_fid;
-	}
-	mutex_unlock(&v9inode->v_mutex);
 	err = finish_open(file, dentry, generic_file_open);
 	if (err)
 		goto error;
 
 	file->private_data = fid;
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+#ifdef CONFIG_9P_FSCACHE
+	if (v9ses->cache & CACHE_FSCACHE)
 		fscache_use_cookie(v9fs_inode_cookie(v9inode),
 				   file->f_mode & FMODE_WRITE);
+#endif
+
+	v9fs_fid_add_modes(fid, v9ses->flags, v9ses->cache, file->f_flags);
 	v9fs_open_fid_add(inode, &fid);
 
 	file->f_mode |= FMODE_CREATED;
@@ -1029,15 +1008,24 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct path *path,
 		 struct kstat *stat, u32 request_mask, unsigned int flags)
 {
 	struct dentry *dentry = path->dentry;
+	struct inode *inode = d_inode(dentry);
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid;
 	struct p9_wstat *st;
 
 	p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
 	v9ses = v9fs_dentry2v9ses(dentry);
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-		generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat);
+	if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
+		generic_fillattr(&nop_mnt_idmap, inode, stat);
 		return 0;
+	} else if (v9ses->cache & CACHE_WRITEBACK) {
+		if (S_ISREG(inode->i_mode)) {
+			int retval = filemap_fdatawrite(inode->i_mapping);
+
+			if (retval)
+				p9_debug(P9_DEBUG_ERROR,
+				    "flushing writeback during getattr returned %d\n", retval);
+		}
 	}
 	fid = v9fs_fid_lookup(dentry);
 	if (IS_ERR(fid))
@@ -1069,7 +1057,6 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap,
 {
 	int retval, use_dentry = 0;
 	struct inode *inode = d_inode(dentry);
-	struct v9fs_inode *v9inode = V9FS_I(inode);
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid = NULL;
 	struct p9_wstat wstat;
@@ -1114,8 +1101,12 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap,
 	}
 
 	/* Write all dirty data */
-	if (d_is_reg(dentry))
-		filemap_write_and_wait(inode->i_mapping);
+	if (d_is_reg(dentry)) {
+		retval = filemap_fdatawrite(inode->i_mapping);
+		if (retval)
+			p9_debug(P9_DEBUG_ERROR,
+			    "flushing writeback during setattr returned %d\n", retval);
+	}
 
 	retval = p9_client_wstat(fid, &wstat);
 
@@ -1126,9 +1117,17 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap,
 		return retval;
 
 	if ((iattr->ia_valid & ATTR_SIZE) &&
-	    iattr->ia_size != i_size_read(inode)) {
+		 iattr->ia_size != i_size_read(inode)) {
 		truncate_setsize(inode, iattr->ia_size);
-		fscache_resize_cookie(v9fs_inode_cookie(v9inode), iattr->ia_size);
+		truncate_pagecache(inode, iattr->ia_size);
+
+#ifdef CONFIG_9P_FSCACHE
+		if (v9ses->cache & CACHE_FSCACHE) {
+			struct v9fs_inode *v9inode = V9FS_I(inode);
+
+			fscache_resize_cookie(v9fs_inode_cookie(v9inode), iattr->ia_size);
+		}
+#endif
 	}
 
 	v9fs_invalidate_inode_attr(inode);
@@ -1412,7 +1411,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
 	 * We don't want to refresh inode->i_size,
 	 * because we may have cached data
 	 */
-	flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ?
+	flags = (v9ses->cache & CACHE_LOOSE) ?
 		V9FS_STAT2INODE_KEEP_ISIZE : 0;
 	v9fs_stat2inode(st, inode, inode->i_sb, flags);
 out:
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index a7da49906d99..5361cd2d7996 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -231,12 +231,12 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 	int err = 0;
 	kgid_t gid;
 	umode_t mode;
+	int p9_omode = v9fs_open_to_dotl_flags(flags);
 	const unsigned char *name = NULL;
 	struct p9_qid qid;
 	struct inode *inode;
 	struct p9_fid *fid = NULL;
-	struct v9fs_inode *v9inode;
-	struct p9_fid *dfid = NULL, *ofid = NULL, *inode_fid = NULL;
+	struct p9_fid *dfid = NULL, *ofid = NULL;
 	struct v9fs_session_info *v9ses;
 	struct posix_acl *pacl = NULL, *dacl = NULL;
 	struct dentry *res = NULL;
@@ -281,14 +281,19 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 	/* Update mode based on ACL value */
 	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
 	if (err) {
-		p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n",
+		p9_debug(P9_DEBUG_VFS, "Failed to get acl values in create %d\n",
 			 err);
 		goto out;
 	}
-	err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags),
-				    mode, gid, &qid);
+
+	if ((v9ses->cache & CACHE_WRITEBACK) && (p9_omode & P9_OWRITE)) {
+		p9_omode = (p9_omode & ~P9_OWRITE) | P9_ORDWR;
+		p9_debug(P9_DEBUG_CACHE,
+			"write-only file with writeback enabled, creating w/ O_RDWR\n");
+	}
+	err = p9_client_create_dotl(ofid, name, p9_omode, mode, gid, &qid);
 	if (err < 0) {
-		p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n",
+		p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in create %d\n",
 			 err);
 		goto out;
 	}
@@ -313,36 +318,19 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 	v9fs_fid_add(dentry, &fid);
 	d_instantiate(dentry, inode);
 
-	v9inode = V9FS_I(inode);
-	mutex_lock(&v9inode->v_mutex);
-	if ((v9ses->cache) && !v9inode->writeback_fid &&
-	    ((flags & O_ACCMODE) != O_RDONLY)) {
-		/*
-		 * clone a fid and add it to writeback_fid
-		 * we do it during open time instead of
-		 * page dirty time via write_begin/page_mkwrite
-		 * because we want write after unlink usecase
-		 * to work.
-		 */
-		inode_fid = v9fs_writeback_fid(dentry);
-		if (IS_ERR(inode_fid)) {
-			err = PTR_ERR(inode_fid);
-			mutex_unlock(&v9inode->v_mutex);
-			goto out;
-		}
-		v9inode->writeback_fid = (void *) inode_fid;
-	}
-	mutex_unlock(&v9inode->v_mutex);
 	/* Since we are opening a file, assign the open fid to the file */
 	err = finish_open(file, dentry, generic_file_open);
 	if (err)
 		goto out;
 	file->private_data = ofid;
 #ifdef CONFIG_9P_FSCACHE
-	if (v9ses->cache == CACHE_FSCACHE)
+	if (v9ses->cache & CACHE_FSCACHE) {
+		struct v9fs_inode *v9inode = V9FS_I(inode);
 		fscache_use_cookie(v9fs_inode_cookie(v9inode),
 				   file->f_mode & FMODE_WRITE);
+	}
 #endif
+	v9fs_fid_add_modes(ofid, v9ses->flags, v9ses->cache, flags);
 	v9fs_open_fid_add(inode, &ofid);
 	file->f_mode |= FMODE_CREATED;
 out:
@@ -414,7 +402,7 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
 	}
 
 	/* instantiate inode and assign the unopened fid to the dentry */
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+	if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
 		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
@@ -457,13 +445,22 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap,
 	struct dentry *dentry = path->dentry;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid;
+	struct inode *inode = d_inode(dentry);
 	struct p9_stat_dotl *st;
 
 	p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
 	v9ses = v9fs_dentry2v9ses(dentry);
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-		generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat);
+	if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
+		generic_fillattr(&nop_mnt_idmap, inode, stat);
 		return 0;
+	} else if (v9ses->cache) {
+		if (S_ISREG(inode->i_mode)) {
+			int retval = filemap_fdatawrite(inode->i_mapping);
+
+			if (retval)
+				p9_debug(P9_DEBUG_ERROR,
+				    "flushing writeback during getattr returned %d\n", retval);
+		}
 	}
 	fid = v9fs_fid_lookup(dentry);
 	if (IS_ERR(fid))
@@ -539,12 +536,13 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap,
 			  struct dentry *dentry, struct iattr *iattr)
 {
 	int retval, use_dentry = 0;
+	struct inode *inode = d_inode(dentry);
+	struct v9fs_session_info __maybe_unused *v9ses;
 	struct p9_fid *fid = NULL;
 	struct p9_iattr_dotl p9attr = {
 		.uid = INVALID_UID,
 		.gid = INVALID_GID,
 	};
-	struct inode *inode = d_inode(dentry);
 
 	p9_debug(P9_DEBUG_VFS, "\n");
 
@@ -552,6 +550,8 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap,
 	if (retval)
 		return retval;
 
+	v9ses = v9fs_dentry2v9ses(dentry);
+
 	p9attr.valid = v9fs_mapped_iattr_valid(iattr->ia_valid);
 	if (iattr->ia_valid & ATTR_MODE)
 		p9attr.mode = iattr->ia_mode;
@@ -582,8 +582,12 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap,
 		return PTR_ERR(fid);
 
 	/* Write all dirty data */
-	if (S_ISREG(inode->i_mode))
-		filemap_write_and_wait(inode->i_mapping);
+	if (S_ISREG(inode->i_mode)) {
+		retval = filemap_fdatawrite(inode->i_mapping);
+		if (retval < 0)
+			p9_debug(P9_DEBUG_ERROR,
+			    "Flushing file prior to setattr failed: %d\n", retval);
+	}
 
 	retval = p9_client_setattr(fid, &p9attr);
 	if (retval < 0) {
@@ -592,9 +596,17 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap,
 		return retval;
 	}
 
-	if ((iattr->ia_valid & ATTR_SIZE) &&
-	    iattr->ia_size != i_size_read(inode))
+	if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size !=
+		 i_size_read(inode)) {
 		truncate_setsize(inode, iattr->ia_size);
+		truncate_pagecache(inode, iattr->ia_size);
+
+#ifdef CONFIG_9P_FSCACHE
+		if (v9ses->cache & CACHE_FSCACHE)
+			fscache_resize_cookie(v9fs_inode_cookie(V9FS_I(inode)),
+				iattr->ia_size);
+#endif
+	}
 
 	v9fs_invalidate_inode_attr(inode);
 	setattr_copy(&nop_mnt_idmap, inode, iattr);
@@ -721,7 +733,7 @@ v9fs_vfs_symlink_dotl(struct mnt_idmap *idmap, struct inode *dir,
 	}
 
 	v9fs_invalidate_inode_attr(dir);
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+	if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
 		/* Now walk from the parent so we can get an unopened fid. */
 		fid = p9_client_walk(dfid, 1, &name, 1);
 		if (IS_ERR(fid)) {
@@ -798,7 +810,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
 	}
 
 	v9fs_invalidate_inode_attr(dir);
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+	if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
 		/* Get the latest stat info from server. */
 		struct p9_fid *fid;
 
@@ -875,7 +887,7 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir,
 	}
 
 	/* instantiate inode and assign the unopened fid to the dentry */
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+	if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
 		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
@@ -960,7 +972,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
 	 * We don't want to refresh inode->i_size,
 	 * because we may have cached data
 	 */
-	flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ?
+	flags = (v9ses->cache & CACHE_LOOSE) ?
 		V9FS_STAT2INODE_KEEP_ISIZE : 0;
 	v9fs_stat2inode_dotl(st, inode, flags);
 out:
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 10449994a972..73db55c050bf 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -63,7 +63,8 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 	sb->s_magic = V9FS_MAGIC;
 	if (v9fs_proto_dotl(v9ses)) {
 		sb->s_op = &v9fs_super_ops_dotl;
-		sb->s_xattr = v9fs_xattr_handlers;
+		if (!(v9ses->flags & V9FS_NO_XATTR))
+			sb->s_xattr = v9fs_xattr_handlers;
 	} else {
 		sb->s_op = &v9fs_super_ops;
 		sb->s_time_max = U32_MAX;
@@ -83,9 +84,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 		sb->s_bdi->io_pages = v9ses->maxdata >> PAGE_SHIFT;
 	}
 
-	sb->s_flags |= SB_ACTIVE | SB_DIRSYNC;
-	if (!v9ses->cache)
-		sb->s_flags |= SB_SYNCHRONOUS;
+	sb->s_flags |= SB_ACTIVE;
 
 #ifdef CONFIG_9P_FS_POSIX_ACL
 	if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL)
@@ -136,7 +135,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 	if (retval)
 		goto release_sb;
 
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+	if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
 		sb->s_d_op = &v9fs_cached_dentry_operations;
 	else
 		sb->s_d_op = &v9fs_dentry_operations;
@@ -277,7 +276,7 @@ static int v9fs_drop_inode(struct inode *inode)
 	struct v9fs_session_info *v9ses;
 
 	v9ses = v9fs_inode2v9ses(inode);
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+	if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
 		return generic_drop_inode(inode);
 	/*
 	 * in case of non cached mode always drop the
@@ -290,49 +289,30 @@ static int v9fs_drop_inode(struct inode *inode)
 static int v9fs_write_inode(struct inode *inode,
 			    struct writeback_control *wbc)
 {
-	int ret;
-	struct p9_wstat wstat;
 	struct v9fs_inode *v9inode;
+
 	/*
 	 * send an fsync request to server irrespective of
 	 * wbc->sync_mode.
 	 */
 	p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
-	v9inode = V9FS_I(inode);
-	if (!v9inode->writeback_fid)
-		return 0;
-	v9fs_blank_wstat(&wstat);
 
-	ret = p9_client_wstat(v9inode->writeback_fid, &wstat);
-	if (ret < 0) {
-		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
-		return ret;
-	}
+	v9inode = V9FS_I(inode);
 	fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode));
+
 	return 0;
 }
 
 static int v9fs_write_inode_dotl(struct inode *inode,
 				 struct writeback_control *wbc)
 {
-	int ret;
 	struct v9fs_inode *v9inode;
-	/*
-	 * send an fsync request to server irrespective of
-	 * wbc->sync_mode.
-	 */
+
 	v9inode = V9FS_I(inode);
-	p9_debug(P9_DEBUG_VFS, "%s: inode %p, writeback_fid %p\n",
-		 __func__, inode, v9inode->writeback_fid);
-	if (!v9inode->writeback_fid)
-		return 0;
-
-	ret = p9_client_fsync(v9inode->writeback_fid, 0);
-	if (ret < 0) {
-		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
-		return ret;
-	}
+	p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
+
 	fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode));
+
 	return 0;
 }
 
diff --git a/fs/Kconfig b/fs/Kconfig
index e99830c65033..cc07a0cd3172 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -250,16 +250,9 @@ config HUGETLBFS
 config HUGETLB_PAGE
 	def_bool HUGETLBFS
 
-#
-# Select this config option from the architecture Kconfig, if it is preferred
-# to enable the feature of HugeTLB Vmemmap Optimization (HVO).
-#
-config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
-	bool
-
 config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	def_bool HUGETLB_PAGE
-	depends on ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+	depends on ARCH_WANT_OPTIMIZE_VMEMMAP
 	depends on SPARSEMEM_VMEMMAP
 
 config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
diff --git a/fs/Makefile b/fs/Makefile
index 05f89b5c962f..834f1c3dba46 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -6,7 +6,6 @@
 # Rewritten to use lists instead of if-statements.
 # 
 
-obj-$(CONFIG_SYSCTL)		+= sysctls.o
 
 obj-y :=	open.o read_write.o file_table.o super.o \
 		char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
@@ -50,7 +49,7 @@ obj-$(CONFIG_FS_MBCACHE)	+= mbcache.o
 obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.o
 obj-$(CONFIG_NFS_COMMON)	+= nfs_common/
 obj-$(CONFIG_COREDUMP)		+= coredump.o
-obj-$(CONFIG_SYSCTL)		+= drop_caches.o
+obj-$(CONFIG_SYSCTL)		+= drop_caches.o sysctls.o
 
 obj-$(CONFIG_FHANDLE)		+= fhandle.o
 obj-y				+= iomap/
@@ -124,7 +123,7 @@ obj-$(CONFIG_9P_FS)		+= 9p/
 obj-$(CONFIG_AFS_FS)		+= afs/
 obj-$(CONFIG_NILFS2_FS)		+= nilfs2/
 obj-$(CONFIG_BEFS_FS)		+= befs/
-obj-$(CONFIG_HOSTFS)		+= hostfs/
+obj-y				+= hostfs/
 obj-$(CONFIG_CACHEFILES)	+= cachefiles/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_TRACING)		+= tracefs/
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 82690d1dd49a..4dd97afa536c 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -275,6 +275,7 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)
 	loff_t i_size;
 	int nr_pages, i;
 	int ret;
+	loff_t remote_size = 0;
 
 	_enter("");
 
@@ -289,6 +290,8 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)
 
 expand:
 	i_size = i_size_read(&dvnode->netfs.inode);
+	if (i_size < remote_size)
+	    i_size = remote_size;
 	if (i_size < 2048) {
 		ret = afs_bad(dvnode, afs_file_error_dir_small);
 		goto error;
@@ -319,16 +322,16 @@ expand:
 		struct folio *folio;
 
 		folio = filemap_get_folio(mapping, i);
-		if (!folio) {
+		if (IS_ERR(folio)) {
 			if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
 				afs_stat_v(dvnode, n_inval);
-
-			ret = -ENOMEM;
 			folio = __filemap_get_folio(mapping,
 						    i, FGP_LOCK | FGP_CREAT,
 						    mapping->gfp_mask);
-			if (!folio)
+			if (IS_ERR(folio)) {
+				ret = PTR_ERR(folio);
 				goto error;
+			}
 			folio_attach_private(folio, (void *)1);
 			folio_unlock(folio);
 		}
@@ -364,6 +367,7 @@ expand:
 			 * buffer.
 			 */
 			up_write(&dvnode->validate_lock);
+			remote_size = req->file_size;
 			goto expand;
 		}
 
@@ -524,7 +528,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
 		 */
 		folio = __filemap_get_folio(dir->i_mapping, ctx->pos / PAGE_SIZE,
 					    FGP_ACCESSED, 0);
-		if (!folio) {
+		if (IS_ERR(folio)) {
 			ret = afs_bad(dvnode, afs_file_error_dir_missing_page);
 			break;
 		}
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index 0ab7752d1b75..f0eddccbdd95 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -115,7 +115,7 @@ static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index)
 	folio = __filemap_get_folio(mapping, index,
 				    FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
 				    mapping->gfp_mask);
-	if (!folio)
+	if (IS_ERR(folio))
 		clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
 	else if (folio && !folio_test_private(folio))
 		folio_attach_private(folio, (void *)1);
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 68d6d5dc608d..719b31374879 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -569,20 +569,10 @@ static void afs_vm_close(struct vm_area_struct *vma)
 static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff)
 {
 	struct afs_vnode *vnode = AFS_FS_I(file_inode(vmf->vma->vm_file));
-	struct afs_file *af = vmf->vma->vm_file->private_data;
 
-	switch (afs_validate(vnode, af->key)) {
-	case 0:
+	if (afs_pagecache_valid(vnode))
 		return filemap_map_pages(vmf, start_pgoff, end_pgoff);
-	case -ENOMEM:
-		return VM_FAULT_OOM;
-	case -EINTR:
-	case -ERESTARTSYS:
-		return VM_FAULT_RETRY;
-	case -ESTALE:
-	default:
-		return VM_FAULT_SIGBUS;
-	}
+	return 0;
 }
 
 static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 0167e96e5198..866bab860a88 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -230,6 +230,7 @@ static void afs_apply_status(struct afs_operation *op,
 			set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
 		}
 		change_size = true;
+		data_changed = true;
 	} else if (vnode->status.type == AFS_FTYPE_DIR) {
 		/* Expected directory change is handled elsewhere so
 		 * that we can locally edit the directory and save on a
@@ -449,7 +450,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode)
 				    0 : FSCACHE_ADV_SINGLE_CHUNK,
 				    &key, sizeof(key),
 				    &aux, sizeof(aux),
-				    vnode->status.size));
+				    i_size_read(&vnode->netfs.inode)));
 #endif
 }
 
@@ -668,6 +669,24 @@ bool afs_check_validity(struct afs_vnode *vnode)
 }
 
 /*
+ * Returns true if the pagecache is still valid.  Does not sleep.
+ */
+bool afs_pagecache_valid(struct afs_vnode *vnode)
+{
+	if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) {
+		if (vnode->netfs.inode.i_nlink)
+			clear_nlink(&vnode->netfs.inode);
+		return true;
+	}
+
+	if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) &&
+	    afs_check_validity(vnode))
+		return true;
+
+	return false;
+}
+
+/*
  * validate a vnode/inode
  * - there are several things we need to check
  *   - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
@@ -684,14 +703,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
 	       vnode->fid.vid, vnode->fid.vnode, vnode->flags,
 	       key_serial(key));
 
-	if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) {
-		if (vnode->netfs.inode.i_nlink)
-			clear_nlink(&vnode->netfs.inode);
-		goto valid;
-	}
-
-	if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) &&
-	    afs_check_validity(vnode))
+	if (afs_pagecache_valid(vnode))
 		goto valid;
 
 	down_write(&vnode->validate_lock);
@@ -765,6 +777,13 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path,
 		if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) &&
 		    stat->nlink > 0)
 			stat->nlink -= 1;
+
+		/* Lie about the size of directories.  We maintain a locally
+		 * edited copy and may make different allocation decisions on
+		 * it, but we need to give userspace the server's size.
+		 */
+		if (S_ISDIR(inode->i_mode))
+			stat->size = vnode->netfs.remote_i_size;
 	} while (need_seqretry(&vnode->cb_lock, seq));
 
 	done_seqretry(&vnode->cb_lock, seq);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 68ae91d21b57..9d3d64921106 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1171,6 +1171,7 @@ extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *);
 extern struct inode *afs_root_iget(struct super_block *, struct key *);
 extern bool afs_check_validity(struct afs_vnode *);
 extern int afs_validate(struct afs_vnode *, struct key *);
+bool afs_pagecache_valid(struct afs_vnode *);
 extern int afs_getattr(struct mnt_idmap *idmap, const struct path *,
 		       struct kstat *, u32, unsigned int);
 extern int afs_setattr(struct mnt_idmap *idmap, struct dentry *, struct iattr *);
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 571f3b9a417e..c822d6006033 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -232,7 +232,7 @@ static void afs_kill_pages(struct address_space *mapping,
 		_debug("kill %lx (to %lx)", index, last);
 
 		folio = filemap_get_folio(mapping, index);
-		if (!folio) {
+		if (IS_ERR(folio)) {
 			next = index + 1;
 			continue;
 		}
@@ -270,7 +270,7 @@ static void afs_redirty_pages(struct writeback_control *wbc,
 		_debug("redirty %llx @%llx", len, start);
 
 		folio = filemap_get_folio(mapping, index);
-		if (!folio) {
+		if (IS_ERR(folio)) {
 			next = index + 1;
 			continue;
 		}
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 8a884e795f6a..1033fbdfdbec 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -2058,7 +2058,7 @@ static int elf_core_dump(struct coredump_params *cprm)
 
 	has_dumped = 1;
 
-	offset += sizeof(elf);				/* Elf header */
+	offset += sizeof(elf);				/* ELF header */
 	offset += segs * sizeof(struct elf_phdr);	/* Program headers */
 
 	/* Write notes phdr entry */
@@ -2174,7 +2174,6 @@ static void __exit exit_elf_binfmt(void)
 
 core_initcall(init_elf_binfmt);
 module_exit(exit_elf_binfmt);
-MODULE_LICENSE("GPL");
 
 #ifdef CONFIG_BINFMT_ELF_KUNIT_TEST
 #include "binfmt_elf_test.c"
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index a05eafcacfb2..05a1471d5283 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1540,7 +1540,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 	fill_note(&auxv_note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv);
 	thread_status_size += notesize(&auxv_note);
 
-	offset = sizeof(*elf);				/* Elf header */
+	offset = sizeof(*elf);				/* ELF header */
 	offset += segs * sizeof(struct elf_phdr);	/* Program headers */
 
 	/* Write notes phdr entry */
diff --git a/fs/buffer.c b/fs/buffer.c
index b3eb905f87d6..a7fc561758b1 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -842,7 +842,7 @@ int remove_inode_buffers(struct inode *inode)
 }
 
 /*
- * Create the appropriate buffers when given a page for data area and
+ * Create the appropriate buffers when given a folio for data area and
  * the size of each buffer.. Use the bh->b_this_page linked list to
  * follow the buffers created.  Return NULL if unable to create more
  * buffers.
@@ -850,8 +850,8 @@ int remove_inode_buffers(struct inode *inode)
  * The retry flag is used to differentiate async IO (paging, swapping)
  * which may not fail from ordinary buffer allocations.
  */
-struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
-		bool retry)
+struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
+					bool retry)
 {
 	struct buffer_head *bh, *head;
 	gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
@@ -861,12 +861,12 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 	if (retry)
 		gfp |= __GFP_NOFAIL;
 
-	/* The page lock pins the memcg */
-	memcg = page_memcg(page);
+	/* The folio lock pins the memcg */
+	memcg = folio_memcg(folio);
 	old_memcg = set_active_memcg(memcg);
 
 	head = NULL;
-	offset = PAGE_SIZE;
+	offset = folio_size(folio);
 	while ((offset -= size) >= 0) {
 		bh = alloc_buffer_head(gfp);
 		if (!bh)
@@ -878,8 +878,8 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 
 		bh->b_size = size;
 
-		/* Link the buffer to its page */
-		set_bh_page(bh, page, offset);
+		/* Link the buffer to its folio */
+		folio_set_bh(bh, folio, offset);
 	}
 out:
 	set_active_memcg(old_memcg);
@@ -898,6 +898,13 @@ no_grow:
 
 	goto out;
 }
+EXPORT_SYMBOL_GPL(folio_alloc_buffers);
+
+struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
+				       bool retry)
+{
+	return folio_alloc_buffers(page_folio(page), size, retry);
+}
 EXPORT_SYMBOL_GPL(alloc_page_buffers);
 
 static inline void
@@ -1484,6 +1491,21 @@ void set_bh_page(struct buffer_head *bh,
 }
 EXPORT_SYMBOL(set_bh_page);
 
+void folio_set_bh(struct buffer_head *bh, struct folio *folio,
+		  unsigned long offset)
+{
+	bh->b_folio = folio;
+	BUG_ON(offset >= folio_size(folio));
+	if (folio_test_highmem(folio))
+		/*
+		 * This catches illegal uses and preserves the offset:
+		 */
+		bh->b_data = (char *)(0 + offset);
+	else
+		bh->b_data = folio_address(folio) + offset;
+}
+EXPORT_SYMBOL(folio_set_bh);
+
 /*
  * Called when truncating a buffer on a page completely.
  */
@@ -1571,18 +1593,17 @@ out:
 }
 EXPORT_SYMBOL(block_invalidate_folio);
 
-
 /*
  * We attach and possibly dirty the buffers atomically wrt
  * block_dirty_folio() via private_lock.  try_to_free_buffers
- * is already excluded via the page lock.
+ * is already excluded via the folio lock.
  */
-void create_empty_buffers(struct page *page,
-			unsigned long blocksize, unsigned long b_state)
+void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize,
+				unsigned long b_state)
 {
 	struct buffer_head *bh, *head, *tail;
 
-	head = alloc_page_buffers(page, blocksize, true);
+	head = folio_alloc_buffers(folio, blocksize, true);
 	bh = head;
 	do {
 		bh->b_state |= b_state;
@@ -1591,19 +1612,26 @@ void create_empty_buffers(struct page *page,
 	} while (bh);
 	tail->b_this_page = head;
 
-	spin_lock(&page->mapping->private_lock);
-	if (PageUptodate(page) || PageDirty(page)) {
+	spin_lock(&folio->mapping->private_lock);
+	if (folio_test_uptodate(folio) || folio_test_dirty(folio)) {
 		bh = head;
 		do {
-			if (PageDirty(page))
+			if (folio_test_dirty(folio))
 				set_buffer_dirty(bh);
-			if (PageUptodate(page))
+			if (folio_test_uptodate(folio))
 				set_buffer_uptodate(bh);
 			bh = bh->b_this_page;
 		} while (bh != head);
 	}
-	attach_page_private(page, head);
-	spin_unlock(&page->mapping->private_lock);
+	folio_attach_private(folio, head);
+	spin_unlock(&folio->mapping->private_lock);
+}
+EXPORT_SYMBOL(folio_create_empty_buffers);
+
+void create_empty_buffers(struct page *page,
+			unsigned long blocksize, unsigned long b_state)
+{
+	folio_create_empty_buffers(page_folio(page), blocksize, b_state);
 }
 EXPORT_SYMBOL(create_empty_buffers);
 
@@ -1694,14 +1722,17 @@ static inline int block_size_bits(unsigned int blocksize)
 	return ilog2(blocksize);
 }
 
-static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
+static struct buffer_head *folio_create_buffers(struct folio *folio,
+						struct inode *inode,
+						unsigned int b_state)
 {
-	BUG_ON(!PageLocked(page));
+	BUG_ON(!folio_test_locked(folio));
 
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, 1 << READ_ONCE(inode->i_blkbits),
-				     b_state);
-	return page_buffers(page);
+	if (!folio_buffers(folio))
+		folio_create_empty_buffers(folio,
+					   1 << READ_ONCE(inode->i_blkbits),
+					   b_state);
+	return folio_buffers(folio);
 }
 
 /*
@@ -1745,8 +1776,8 @@ int __block_write_full_page(struct inode *inode, struct page *page,
 	int nr_underway = 0;
 	blk_opf_t write_flags = wbc_to_write_flags(wbc);
 
-	head = create_page_buffers(page, inode,
-					(1 << BH_Dirty)|(1 << BH_Uptodate));
+	head = folio_create_buffers(page_folio(page), inode,
+				    (1 << BH_Dirty) | (1 << BH_Uptodate));
 
 	/*
 	 * Be very careful.  We have no exclusion from block_dirty_folio
@@ -2009,7 +2040,7 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
 	BUG_ON(to > PAGE_SIZE);
 	BUG_ON(from > to);
 
-	head = create_page_buffers(&folio->page, inode, 0);
+	head = folio_create_buffers(folio, inode, 0);
 	blocksize = head->b_size;
 	bbits = block_size_bits(blocksize);
 
@@ -2295,7 +2326,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
 
 	VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
 
-	head = create_page_buffers(&folio->page, inode, 0);
+	head = folio_create_buffers(folio, inode, 0);
 	blocksize = head->b_size;
 	bbits = block_size_bits(blocksize);
 
diff --git a/fs/cachefiles/error_inject.c b/fs/cachefiles/error_inject.c
index 58f8aec964e4..18de8a876b02 100644
--- a/fs/cachefiles/error_inject.c
+++ b/fs/cachefiles/error_inject.c
@@ -22,18 +22,9 @@ static struct ctl_table cachefiles_sysctls[] = {
 	{}
 };
 
-static struct ctl_table cachefiles_sysctls_root[] = {
-	{
-		.procname	= "cachefiles",
-		.mode		= 0555,
-		.child		= cachefiles_sysctls,
-	},
-	{}
-};
-
 int __init cachefiles_register_error_injection(void)
 {
-	cachefiles_sysctl = register_sysctl_table(cachefiles_sysctls_root);
+	cachefiles_sysctl = register_sysctl("cachefiles", cachefiles_sysctls);
 	if (!cachefiles_sysctl)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index d5335f445233..6bb251a4d613 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -808,6 +808,7 @@ static int ceph_writepages_start(struct address_space *mapping,
 	bool should_loop, range_whole = false;
 	bool done = false;
 	bool caching = ceph_is_cache_enabled(inode);
+	xa_mark_t tag;
 
 	if (wbc->sync_mode == WB_SYNC_NONE &&
 	    fsc->write_congested)
@@ -834,6 +835,11 @@ static int ceph_writepages_start(struct address_space *mapping,
 	start_index = wbc->range_cyclic ? mapping->writeback_index : 0;
 	index = start_index;
 
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
+		tag = PAGECACHE_TAG_TOWRITE;
+	} else {
+		tag = PAGECACHE_TAG_DIRTY;
+	}
 retry:
 	/* find oldest snap context with dirty data */
 	snapc = get_oldest_context(inode, &ceph_wbc, NULL);
@@ -872,6 +878,9 @@ retry:
 		dout(" non-head snapc, range whole\n");
 	}
 
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+		tag_pages_for_writeback(mapping, index, end);
+
 	ceph_put_snap_context(last_snapc);
 	last_snapc = snapc;
 
@@ -888,7 +897,7 @@ retry:
 
 get_more_pages:
 		nr_folios = filemap_get_folios_tag(mapping, &index,
-				end, PAGECACHE_TAG_DIRTY, &fbatch);
+						   end, tag, &fbatch);
 		dout("pagevec_lookup_range_tag got %d\n", nr_folios);
 		if (!nr_folios && !locked_pages)
 			break;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 7cc20772eac9..789be30d6ee2 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -431,7 +431,7 @@ void ceph_reservation_status(struct ceph_fs_client *fsc,
  *
  * Called with i_ceph_lock held.
  */
-static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
+struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
 {
 	struct ceph_cap *cap;
 	struct rb_node *n = ci->i_caps.rb_node;
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index bec3c4549c07..3904333fa6c3 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -248,14 +248,20 @@ static int metrics_caps_show(struct seq_file *s, void *p)
 	return 0;
 }
 
-static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
+static int caps_show_cb(struct inode *inode, int mds, void *p)
 {
+	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct seq_file *s = p;
-
-	seq_printf(s, "0x%-17llx%-3d%-17s%-17s\n", ceph_ino(inode),
-		   cap->session->s_mds,
-		   ceph_cap_string(cap->issued),
-		   ceph_cap_string(cap->implemented));
+	struct ceph_cap *cap;
+
+	spin_lock(&ci->i_ceph_lock);
+	cap = __get_cap_for_mds(ci, mds);
+	if (cap)
+		seq_printf(s, "0x%-17llx%-3d%-17s%-17s\n", ceph_ino(inode),
+			   cap->session->s_mds,
+			   ceph_cap_string(cap->issued),
+			   ceph_cap_string(cap->implemented));
+	spin_unlock(&ci->i_ceph_lock);
 	return 0;
 }
 
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 0ced8b570e42..cb67ac821f0e 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1050,6 +1050,9 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
 	struct ceph_mds_request *req;
 	int err;
 
+	if (dentry->d_flags & DCACHE_DISCONNECTED)
+		return -EINVAL;
+
 	err = ceph_wait_on_conflict_unlink(dentry);
 	if (err)
 		return err;
@@ -1057,8 +1060,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
 	if (ceph_snap(dir) != CEPH_NOSNAP)
 		return -EROFS;
 
-	dout("link in dir %p old_dentry %p dentry %p\n", dir,
-	     old_dentry, dentry);
+	dout("link in dir %p %llx.%llx old_dentry %p:'%pd' dentry %p:'%pd'\n",
+	     dir, ceph_vinop(dir), old_dentry, old_dentry, dentry, dentry);
 	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
 	if (IS_ERR(req)) {
 		d_drop(dentry);
@@ -1067,6 +1070,12 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
 	req->r_dentry = dget(dentry);
 	req->r_num_caps = 2;
 	req->r_old_dentry = dget(old_dentry);
+	/*
+	 * The old_dentry maybe a DCACHE_DISCONNECTED dentry, then we
+	 * will just pass the ino# to MDSs.
+	 */
+	if (old_dentry->d_flags & DCACHE_DISCONNECTED)
+		req->r_ino2 = ceph_vino(d_inode(old_dentry));
 	req->r_parent = dir;
 	ihold(dir);
 	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 27a245d959c0..29cf00220b09 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1632,8 +1632,8 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
  * Caller must hold session s_mutex.
  */
 int ceph_iterate_session_caps(struct ceph_mds_session *session,
-			      int (*cb)(struct inode *, struct ceph_cap *,
-					void *), void *arg)
+			      int (*cb)(struct inode *, int mds, void *),
+			      void *arg)
 {
 	struct list_head *p;
 	struct ceph_cap *cap;
@@ -1645,6 +1645,8 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
 	spin_lock(&session->s_cap_lock);
 	p = session->s_caps.next;
 	while (p != &session->s_caps) {
+		int mds;
+
 		cap = list_entry(p, struct ceph_cap, session_caps);
 		inode = igrab(&cap->ci->netfs.inode);
 		if (!inode) {
@@ -1652,6 +1654,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
 			continue;
 		}
 		session->s_cap_iterator = cap;
+		mds = cap->mds;
 		spin_unlock(&session->s_cap_lock);
 
 		if (last_inode) {
@@ -1663,7 +1666,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
 			old_cap = NULL;
 		}
 
-		ret = cb(inode, cap, arg);
+		ret = cb(inode, mds, arg);
 		last_inode = inode;
 
 		spin_lock(&session->s_cap_lock);
@@ -1696,20 +1699,25 @@ out:
 	return ret;
 }
 
-static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
-				  void *arg)
+static int remove_session_caps_cb(struct inode *inode, int mds, void *arg)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	bool invalidate = false;
-	int iputs;
+	struct ceph_cap *cap;
+	int iputs = 0;
 
-	dout("removing cap %p, ci is %p, inode is %p\n",
-	     cap, ci, &ci->netfs.inode);
 	spin_lock(&ci->i_ceph_lock);
-	iputs = ceph_purge_inode_cap(inode, cap, &invalidate);
+	cap = __get_cap_for_mds(ci, mds);
+	if (cap) {
+		dout(" removing cap %p, ci is %p, inode is %p\n",
+		     cap, ci, &ci->netfs.inode);
+
+		iputs = ceph_purge_inode_cap(inode, cap, &invalidate);
+	}
 	spin_unlock(&ci->i_ceph_lock);
 
-	wake_up_all(&ci->i_cap_wq);
+	if (cap)
+		wake_up_all(&ci->i_cap_wq);
 	if (invalidate)
 		ceph_queue_invalidate(inode);
 	while (iputs--)
@@ -1780,8 +1788,7 @@ enum {
  *
  * caller must hold s_mutex.
  */
-static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
-			      void *arg)
+static int wake_up_session_cb(struct inode *inode, int mds, void *arg)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	unsigned long ev = (unsigned long)arg;
@@ -1792,12 +1799,14 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
 		ci->i_requested_max_size = 0;
 		spin_unlock(&ci->i_ceph_lock);
 	} else if (ev == RENEWCAPS) {
-		if (cap->cap_gen < atomic_read(&cap->session->s_cap_gen)) {
-			/* mds did not re-issue stale cap */
-			spin_lock(&ci->i_ceph_lock);
+		struct ceph_cap *cap;
+
+		spin_lock(&ci->i_ceph_lock);
+		cap = __get_cap_for_mds(ci, mds);
+		/* mds did not re-issue stale cap */
+		if (cap && cap->cap_gen < atomic_read(&cap->session->s_cap_gen))
 			cap->issued = cap->implemented = CEPH_CAP_PIN;
-			spin_unlock(&ci->i_ceph_lock);
-		}
+		spin_unlock(&ci->i_ceph_lock);
 	} else if (ev == FORCE_RO) {
 	}
 	wake_up_all(&ci->i_cap_wq);
@@ -1959,16 +1968,22 @@ out:
  * Yes, this is a bit sloppy.  Our only real goal here is to respond to
  * memory pressure from the MDS, though, so it needn't be perfect.
  */
-static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
+static int trim_caps_cb(struct inode *inode, int mds, void *arg)
 {
 	int *remaining = arg;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int used, wanted, oissued, mine;
+	struct ceph_cap *cap;
 
 	if (*remaining <= 0)
 		return -1;
 
 	spin_lock(&ci->i_ceph_lock);
+	cap = __get_cap_for_mds(ci, mds);
+	if (!cap) {
+		spin_unlock(&ci->i_ceph_lock);
+		return 0;
+	}
 	mine = cap->issued | cap->implemented;
 	used = __ceph_caps_used(ci);
 	wanted = __ceph_caps_file_wanted(ci);
@@ -2555,6 +2570,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
 	u64 ino1 = 0, ino2 = 0;
 	int pathlen1 = 0, pathlen2 = 0;
 	bool freepath1 = false, freepath2 = false;
+	struct dentry *old_dentry = NULL;
 	int len;
 	u16 releases;
 	void *p, *end;
@@ -2572,7 +2588,10 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
 	}
 
 	/* If r_old_dentry is set, then assume that its parent is locked */
-	ret = set_request_path_attr(NULL, req->r_old_dentry,
+	if (req->r_old_dentry &&
+	    !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED))
+		old_dentry = req->r_old_dentry;
+	ret = set_request_path_attr(NULL, old_dentry,
 			      req->r_old_dentry_dir,
 			      req->r_path2, req->r_ino2.ino,
 			      &path2, &pathlen2, &ino2, &freepath2, true);
@@ -3911,26 +3930,22 @@ out_unlock:
 /*
  * Encode information about a cap for a reconnect with the MDS.
  */
-static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
-			  void *arg)
+static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
 {
 	union {
 		struct ceph_mds_cap_reconnect v2;
 		struct ceph_mds_cap_reconnect_v1 v1;
 	} rec;
-	struct ceph_inode_info *ci = cap->ci;
+	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_reconnect_state *recon_state = arg;
 	struct ceph_pagelist *pagelist = recon_state->pagelist;
 	struct dentry *dentry;
+	struct ceph_cap *cap;
 	char *path;
-	int pathlen = 0, err;
+	int pathlen = 0, err = 0;
 	u64 pathbase;
 	u64 snap_follows;
 
-	dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
-	     inode, ceph_vinop(inode), cap, cap->cap_id,
-	     ceph_cap_string(cap->issued));
-
 	dentry = d_find_primary(inode);
 	if (dentry) {
 		/* set pathbase to parent dir when msg_version >= 2 */
@@ -3947,6 +3962,15 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
 	}
 
 	spin_lock(&ci->i_ceph_lock);
+	cap = __get_cap_for_mds(ci, mds);
+	if (!cap) {
+		spin_unlock(&ci->i_ceph_lock);
+		goto out_err;
+	}
+	dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
+	     inode, ceph_vinop(inode), cap, cap->cap_id,
+	     ceph_cap_string(cap->issued));
+
 	cap->seq = 0;        /* reset cap seq */
 	cap->issue_seq = 0;  /* and issue_seq */
 	cap->mseq = 0;       /* and migrate_seq */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 0598faa50e2e..724307ff89cd 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -355,8 +355,8 @@ struct ceph_snapid_map {
 	struct rb_node node;
 	struct list_head lru;
 	atomic_t ref;
-	u64 snap;
 	dev_t dev;
+	u64 snap;
 	unsigned long last_used;
 };
 
@@ -541,8 +541,7 @@ extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
 extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc);
 extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr);
 extern int ceph_iterate_session_caps(struct ceph_mds_session *session,
-				     int (*cb)(struct inode *,
-					       struct ceph_cap *, void *),
+				     int (*cb)(struct inode *, int mds, void *),
 				     void *arg);
 extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 6ecca2c6d137..d24bf0db5234 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1192,6 +1192,8 @@ extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
 				    struct ceph_mds_session *session);
 void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
 				   struct ceph_inode_info *ci);
+extern struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci,
+					  int mds);
 extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
 					     int mds);
 extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 1fe1b62abebd..806183959c47 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -535,6 +535,8 @@ static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,
 	return NULL;
 }
 
+#define MAX_XATTR_VAL_PRINT_LEN 256
+
 static int __set_xattr(struct ceph_inode_info *ci,
 			   const char *name, int name_len,
 			   const char *val, int val_len,
@@ -597,7 +599,7 @@ static int __set_xattr(struct ceph_inode_info *ci,
 		xattr->should_free_name = update_xattr;
 
 		ci->i_xattrs.count++;
-		dout("__set_xattr count=%d\n", ci->i_xattrs.count);
+		dout("%s count=%d\n", __func__, ci->i_xattrs.count);
 	} else {
 		kfree(*newxattr);
 		*newxattr = NULL;
@@ -625,11 +627,13 @@ static int __set_xattr(struct ceph_inode_info *ci,
 	if (new) {
 		rb_link_node(&xattr->node, parent, p);
 		rb_insert_color(&xattr->node, &ci->i_xattrs.index);
-		dout("__set_xattr_val p=%p\n", p);
+		dout("%s p=%p\n", __func__, p);
 	}
 
-	dout("__set_xattr_val added %llx.%llx xattr %p %.*s=%.*s\n",
-	     ceph_vinop(&ci->netfs.inode), xattr, name_len, name, val_len, val);
+	dout("%s added %llx.%llx xattr %p %.*s=%.*s%s\n", __func__,
+	     ceph_vinop(&ci->netfs.inode), xattr, name_len, name,
+	     min(val_len, MAX_XATTR_VAL_PRINT_LEN), val,
+	     val_len > MAX_XATTR_VAL_PRINT_LEN ? "..." : "");
 
 	return 0;
 }
@@ -655,13 +659,15 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
 		else if (c > 0)
 			p = &(*p)->rb_right;
 		else {
-			dout("__get_xattr %s: found %.*s\n", name,
-			     xattr->val_len, xattr->val);
+			int len = min(xattr->val_len, MAX_XATTR_VAL_PRINT_LEN);
+
+			dout("%s %s: found %.*s%s\n", __func__, name, len,
+			     xattr->val, xattr->val_len > len ? "..." : "");
 			return xattr;
 		}
 	}
 
-	dout("__get_xattr %s: not found\n", name);
+	dout("%s %s: not found\n", __func__, name);
 
 	return NULL;
 }
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 415176b2cf32..74cd6fafb33e 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -162,6 +162,6 @@ extern const struct export_operations cifs_export_ops;
 #endif /* CONFIG_CIFS_NFSD_EXPORT */
 
 /* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 41
-#define CIFS_VERSION   "2.42"
+#define SMB3_PRODUCT_BUILD 43
+#define CIFS_VERSION   "2.43"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 1cbb90587995..7bfef741f758 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1916,18 +1916,22 @@ void cifs_put_smb_ses(struct cifs_ses *ses)
 	/* ses_count can never go negative */
 	WARN_ON(ses->ses_count < 0);
 
+	spin_lock(&ses->ses_lock);
 	if (ses->ses_status == SES_GOOD)
 		ses->ses_status = SES_EXITING;
 
-	cifs_free_ipc(ses);
-
 	if (ses->ses_status == SES_EXITING && server->ops->logoff) {
+		spin_unlock(&ses->ses_lock);
+		cifs_free_ipc(ses);
 		xid = get_xid();
 		rc = server->ops->logoff(xid, ses);
 		if (rc)
 			cifs_server_dbg(VFS, "%s: Session Logoff failure rc=%d\n",
 				__func__, rc);
 		_free_xid(xid);
+	} else {
+		spin_unlock(&ses->ses_lock);
+		cifs_free_ipc(ses);
 	}
 
 	spin_lock(&cifs_tcp_ses_lock);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index b33d2e7b0f98..c5fcefdfd797 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -4882,6 +4882,8 @@ void cifs_oplock_break(struct work_struct *work)
 	struct TCP_Server_Info *server = tcon->ses->server;
 	int rc = 0;
 	bool purge_cache = false;
+	struct cifs_deferred_close *dclose;
+	bool is_deferred = false;
 
 	wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS,
 			TASK_UNINTERRUPTIBLE);
@@ -4918,6 +4920,20 @@ void cifs_oplock_break(struct work_struct *work)
 
 oplock_break_ack:
 	/*
+	 * When oplock break is received and there are no active
+	 * file handles but cached, then schedule deferred close immediately.
+	 * So, new open will not use cached handle.
+	 */
+	spin_lock(&CIFS_I(inode)->deferred_lock);
+	is_deferred = cifs_is_deferred_close(cfile, &dclose);
+	spin_unlock(&CIFS_I(inode)->deferred_lock);
+
+	if (!CIFS_CACHE_HANDLE(cinode) && is_deferred &&
+			cfile->deferred_close_scheduled && delayed_work_pending(&cfile->deferred)) {
+		cifs_close_deferred_file(cinode);
+	}
+
+	/*
 	 * releasing stale oplock after recent reconnect of smb session using
 	 * a now incorrect file handle is not a data integrity issue but do
 	 * not bother sending an oplock release if session to server still is
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 7f085ed2d866..cd914be905b2 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -749,7 +749,9 @@ cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode)
 	list_for_each_entry(cfile, &cifs_inode->openFileList, flist) {
 		if (delayed_work_pending(&cfile->deferred)) {
 			if (cancel_delayed_work(&cfile->deferred)) {
+				spin_lock(&cifs_inode->deferred_lock);
 				cifs_del_deferred_close(cfile);
+				spin_unlock(&cifs_inode->deferred_lock);
 
 				tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
 				if (tmp_list == NULL)
@@ -762,7 +764,7 @@ cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode)
 	spin_unlock(&cifs_inode->open_file_lock);
 
 	list_for_each_entry_safe(tmp_list, tmp_next_list, &file_head, list) {
-		_cifsFileInfo_put(tmp_list->cfile, true, false);
+		_cifsFileInfo_put(tmp_list->cfile, false, false);
 		list_del(&tmp_list->list);
 		kfree(tmp_list);
 	}
@@ -780,7 +782,9 @@ cifs_close_all_deferred_files(struct cifs_tcon *tcon)
 	list_for_each_entry(cfile, &tcon->openFileList, tlist) {
 		if (delayed_work_pending(&cfile->deferred)) {
 			if (cancel_delayed_work(&cfile->deferred)) {
+				spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
 				cifs_del_deferred_close(cfile);
+				spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
 
 				tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
 				if (tmp_list == NULL)
@@ -815,7 +819,9 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path)
 		if (strstr(full_path, path)) {
 			if (delayed_work_pending(&cfile->deferred)) {
 				if (cancel_delayed_work(&cfile->deferred)) {
+					spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
 					cifs_del_deferred_close(cfile);
+					spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
 
 					tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
 					if (tmp_list == NULL)
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 366f0c3b799b..0521aa1da644 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -821,7 +821,6 @@ create_posix_buf(umode_t mode)
 static int
 add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode)
 {
-	struct smb2_create_req *req = iov[0].iov_base;
 	unsigned int num = *num_iovec;
 
 	iov[num].iov_base = create_posix_buf(mode);
@@ -830,11 +829,6 @@ add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode)
 	if (iov[num].iov_base == NULL)
 		return -ENOMEM;
 	iov[num].iov_len = sizeof(struct create_posix);
-	if (!req->CreateContextsOffset)
-		req->CreateContextsOffset = cpu_to_le32(
-				sizeof(struct smb2_create_req) +
-				iov[num - 1].iov_len);
-	le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_posix));
 	*num_iovec = num + 1;
 	return 0;
 }
@@ -2069,7 +2063,7 @@ create_reconnect_durable_buf(struct cifs_fid *fid)
 static void
 parse_query_id_ctxt(struct create_context *cc, struct smb2_file_all_info *buf)
 {
-	struct create_on_disk_id *pdisk_id = (struct create_on_disk_id *)cc;
+	struct create_disk_id_rsp *pdisk_id = (struct create_disk_id_rsp *)cc;
 
 	cifs_dbg(FYI, "parse query id context 0x%llx 0x%llx\n",
 		pdisk_id->DiskFileId, pdisk_id->VolumeId);
@@ -2172,10 +2166,11 @@ smb2_parse_contexts(struct TCP_Server_Info *server,
 }
 
 static int
-add_lease_context(struct TCP_Server_Info *server, struct kvec *iov,
+add_lease_context(struct TCP_Server_Info *server,
+		  struct smb2_create_req *req,
+		  struct kvec *iov,
 		  unsigned int *num_iovec, u8 *lease_key, __u8 *oplock)
 {
-	struct smb2_create_req *req = iov[0].iov_base;
 	unsigned int num = *num_iovec;
 
 	iov[num].iov_base = server->ops->create_lease_buf(lease_key, *oplock);
@@ -2183,12 +2178,6 @@ add_lease_context(struct TCP_Server_Info *server, struct kvec *iov,
 		return -ENOMEM;
 	iov[num].iov_len = server->vals->create_lease_size;
 	req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE;
-	if (!req->CreateContextsOffset)
-		req->CreateContextsOffset = cpu_to_le32(
-				sizeof(struct smb2_create_req) +
-				iov[num - 1].iov_len);
-	le32_add_cpu(&req->CreateContextsLength,
-		     server->vals->create_lease_size);
 	*num_iovec = num + 1;
 	return 0;
 }
@@ -2267,18 +2256,12 @@ static int
 add_durable_v2_context(struct kvec *iov, unsigned int *num_iovec,
 		    struct cifs_open_parms *oparms)
 {
-	struct smb2_create_req *req = iov[0].iov_base;
 	unsigned int num = *num_iovec;
 
 	iov[num].iov_base = create_durable_v2_buf(oparms);
 	if (iov[num].iov_base == NULL)
 		return -ENOMEM;
 	iov[num].iov_len = sizeof(struct create_durable_v2);
-	if (!req->CreateContextsOffset)
-		req->CreateContextsOffset =
-			cpu_to_le32(sizeof(struct smb2_create_req) +
-								iov[1].iov_len);
-	le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable_v2));
 	*num_iovec = num + 1;
 	return 0;
 }
@@ -2287,7 +2270,6 @@ static int
 add_durable_reconnect_v2_context(struct kvec *iov, unsigned int *num_iovec,
 		    struct cifs_open_parms *oparms)
 {
-	struct smb2_create_req *req = iov[0].iov_base;
 	unsigned int num = *num_iovec;
 
 	/* indicate that we don't need to relock the file */
@@ -2297,12 +2279,6 @@ add_durable_reconnect_v2_context(struct kvec *iov, unsigned int *num_iovec,
 	if (iov[num].iov_base == NULL)
 		return -ENOMEM;
 	iov[num].iov_len = sizeof(struct create_durable_handle_reconnect_v2);
-	if (!req->CreateContextsOffset)
-		req->CreateContextsOffset =
-			cpu_to_le32(sizeof(struct smb2_create_req) +
-								iov[1].iov_len);
-	le32_add_cpu(&req->CreateContextsLength,
-			sizeof(struct create_durable_handle_reconnect_v2));
 	*num_iovec = num + 1;
 	return 0;
 }
@@ -2311,7 +2287,6 @@ static int
 add_durable_context(struct kvec *iov, unsigned int *num_iovec,
 		    struct cifs_open_parms *oparms, bool use_persistent)
 {
-	struct smb2_create_req *req = iov[0].iov_base;
 	unsigned int num = *num_iovec;
 
 	if (use_persistent) {
@@ -2331,11 +2306,6 @@ add_durable_context(struct kvec *iov, unsigned int *num_iovec,
 	if (iov[num].iov_base == NULL)
 		return -ENOMEM;
 	iov[num].iov_len = sizeof(struct create_durable);
-	if (!req->CreateContextsOffset)
-		req->CreateContextsOffset =
-			cpu_to_le32(sizeof(struct smb2_create_req) +
-								iov[1].iov_len);
-	le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable));
 	*num_iovec = num + 1;
 	return 0;
 }
@@ -2369,18 +2339,12 @@ create_twarp_buf(__u64 timewarp)
 static int
 add_twarp_context(struct kvec *iov, unsigned int *num_iovec, __u64 timewarp)
 {
-	struct smb2_create_req *req = iov[0].iov_base;
 	unsigned int num = *num_iovec;
 
 	iov[num].iov_base = create_twarp_buf(timewarp);
 	if (iov[num].iov_base == NULL)
 		return -ENOMEM;
 	iov[num].iov_len = sizeof(struct crt_twarp_ctxt);
-	if (!req->CreateContextsOffset)
-		req->CreateContextsOffset = cpu_to_le32(
-				sizeof(struct smb2_create_req) +
-				iov[num - 1].iov_len);
-	le32_add_cpu(&req->CreateContextsLength, sizeof(struct crt_twarp_ctxt));
 	*num_iovec = num + 1;
 	return 0;
 }
@@ -2503,7 +2467,6 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
 static int
 add_sd_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode, bool set_owner)
 {
-	struct smb2_create_req *req = iov[0].iov_base;
 	unsigned int num = *num_iovec;
 	unsigned int len = 0;
 
@@ -2511,11 +2474,6 @@ add_sd_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode, bool set
 	if (iov[num].iov_base == NULL)
 		return -ENOMEM;
 	iov[num].iov_len = len;
-	if (!req->CreateContextsOffset)
-		req->CreateContextsOffset = cpu_to_le32(
-				sizeof(struct smb2_create_req) +
-				iov[num - 1].iov_len);
-	le32_add_cpu(&req->CreateContextsLength, len);
 	*num_iovec = num + 1;
 	return 0;
 }
@@ -2546,18 +2504,12 @@ create_query_id_buf(void)
 static int
 add_query_id_context(struct kvec *iov, unsigned int *num_iovec)
 {
-	struct smb2_create_req *req = iov[0].iov_base;
 	unsigned int num = *num_iovec;
 
 	iov[num].iov_base = create_query_id_buf();
 	if (iov[num].iov_base == NULL)
 		return -ENOMEM;
 	iov[num].iov_len = sizeof(struct crt_query_id_ctxt);
-	if (!req->CreateContextsOffset)
-		req->CreateContextsOffset = cpu_to_le32(
-				sizeof(struct smb2_create_req) +
-				iov[num - 1].iov_len);
-	le32_add_cpu(&req->CreateContextsLength, sizeof(struct crt_query_id_ctxt));
 	*num_iovec = num + 1;
 	return 0;
 }
@@ -2720,6 +2672,9 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
 		rc = add_posix_context(iov, &n_iov, mode);
 		if (rc)
 			goto err_free_req;
+		req->CreateContextsOffset = cpu_to_le32(
+			sizeof(struct smb2_create_req) +
+			iov[1].iov_len);
 		pc_buf = iov[n_iov-1].iov_base;
 	}
 
@@ -2857,21 +2812,13 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
 		  (oparms->create_options & CREATE_NOT_FILE))
 		req->RequestedOplockLevel = *oplock; /* no srv lease support */
 	else {
-		rc = add_lease_context(server, iov, &n_iov,
+		rc = add_lease_context(server, req, iov, &n_iov,
 				       oparms->fid->lease_key, oplock);
 		if (rc)
 			return rc;
 	}
 
 	if (*oplock == SMB2_OPLOCK_LEVEL_BATCH) {
-		/* need to set Next field of lease context if we request it */
-		if (server->capabilities & SMB2_GLOBAL_CAP_LEASING) {
-			struct create_context *ccontext =
-			    (struct create_context *)iov[n_iov-1].iov_base;
-			ccontext->Next =
-				cpu_to_le32(server->vals->create_lease_size);
-		}
-
 		rc = add_durable_context(iov, &n_iov, oparms,
 					tcon->use_persistent);
 		if (rc)
@@ -2879,13 +2826,6 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
 	}
 
 	if (tcon->posix_extensions) {
-		if (n_iov > 2) {
-			struct create_context *ccontext =
-			    (struct create_context *)iov[n_iov-1].iov_base;
-			ccontext->Next =
-				cpu_to_le32(iov[n_iov-1].iov_len);
-		}
-
 		rc = add_posix_context(iov, &n_iov, oparms->mode);
 		if (rc)
 			return rc;
@@ -2893,13 +2833,6 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
 
 	if (tcon->snapshot_time) {
 		cifs_dbg(FYI, "adding snapshot context\n");
-		if (n_iov > 2) {
-			struct create_context *ccontext =
-			    (struct create_context *)iov[n_iov-1].iov_base;
-			ccontext->Next =
-				cpu_to_le32(iov[n_iov-1].iov_len);
-		}
-
 		rc = add_twarp_context(iov, &n_iov, tcon->snapshot_time);
 		if (rc)
 			return rc;
@@ -2923,12 +2856,6 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
 			set_owner = false;
 
 		if (set_owner | set_mode) {
-			if (n_iov > 2) {
-				struct create_context *ccontext =
-				    (struct create_context *)iov[n_iov-1].iov_base;
-				ccontext->Next = cpu_to_le32(iov[n_iov-1].iov_len);
-			}
-
 			cifs_dbg(FYI, "add sd with mode 0x%x\n", oparms->mode);
 			rc = add_sd_context(iov, &n_iov, oparms->mode, set_owner);
 			if (rc)
@@ -2936,12 +2863,30 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
 		}
 	}
 
+	add_query_id_context(iov, &n_iov);
+
 	if (n_iov > 2) {
-		struct create_context *ccontext =
-			(struct create_context *)iov[n_iov-1].iov_base;
-		ccontext->Next = cpu_to_le32(iov[n_iov-1].iov_len);
+		/*
+		 * We have create contexts behind iov[1] (the file
+		 * name), point at them from the main create request
+		 */
+		req->CreateContextsOffset = cpu_to_le32(
+			sizeof(struct smb2_create_req) +
+			iov[1].iov_len);
+		req->CreateContextsLength = 0;
+
+		for (unsigned int i = 2; i < (n_iov-1); i++) {
+			struct kvec *v = &iov[i];
+			size_t len = v->iov_len;
+			struct create_context *cctx =
+				(struct create_context *)v->iov_base;
+
+			cctx->Next = cpu_to_le32(len);
+			le32_add_cpu(&req->CreateContextsLength, len);
+		}
+		le32_add_cpu(&req->CreateContextsLength,
+			     iov[n_iov-1].iov_len);
 	}
-	add_query_id_context(iov, &n_iov);
 
 	rqst->rq_nvec = n_iov;
 	return 0;
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 2114e8a0c63a..220994d0a0f7 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -132,17 +132,6 @@ struct share_redirect_error_context_rsp {
 #define SMB2_LEASE_HANDLE_CACHING_HE	0x02
 #define SMB2_LEASE_WRITE_CACHING_HE	0x04
 
-struct create_durable {
-	struct create_context ccontext;
-	__u8   Name[8];
-	union {
-		__u8  Reserved[16];
-		struct {
-			__u64 PersistentFileId;
-			__u64 VolatileFileId;
-		} Fid;
-	} Data;
-} __packed;
 
 /* See MS-SMB2 2.2.13.2.11 */
 /* Flags */
@@ -170,15 +159,6 @@ struct durable_reconnect_context_v2 {
 	__le32 Flags; /* see above DHANDLE_FLAG_PERSISTENT */
 } __packed;
 
-/* See MS-SMB2 2.2.14.2.9 */
-struct create_on_disk_id {
-	struct create_context ccontext;
-	__u8   Name[8];
-	__le64 DiskFileId;
-	__le64 VolumeId;
-	__u32  Reserved[4];
-} __packed;
-
 /* See MS-SMB2 2.2.14.2.12 */
 struct durable_reconnect_context_v2_rsp {
 	__le32 Timeout;
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index b39580ad4ce5..3c3148588491 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -361,7 +361,7 @@ static int __init init_coda_psdev(void)
 		       __func__, CODA_PSDEV_MAJOR);
 		return -EIO;
 	}
-	coda_psdev_class = class_create(THIS_MODULE, "coda");
+	coda_psdev_class = class_create("coda");
 	if (IS_ERR(coda_psdev_class)) {
 		err = PTR_ERR(coda_psdev_class);
 		goto out_chrdev;
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index fda3b702b1c5..a247c14aaab7 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -39,19 +39,10 @@ static struct ctl_table coda_table[] = {
 	{}
 };
 
-static struct ctl_table fs_table[] = {
-	{
-		.procname	= "coda",
-		.mode		= 0555,
-		.child		= coda_table
-	},
-	{}
-};
-
 void coda_sysctl_init(void)
 {
 	if ( !fs_table_header )
-		fs_table_header = register_sysctl_table(fs_table);
+		fs_table_header = register_sysctl("coda", coda_table);
 }
 
 void coda_sysctl_clean(void)
diff --git a/fs/coredump.c b/fs/coredump.c
index 5df1e6e1eb2b..ece7badf701b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -882,6 +882,7 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
 	pos = file->f_pos;
 	bvec_set_page(&bvec, page, PAGE_SIZE, 0);
 	iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE);
+	iov_iter_set_copy_mc(&iter);
 	n = __kernel_write_iter(cprm->file, &iter, &pos);
 	if (n != PAGE_SIZE)
 		return 0;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 4f757a71f99b..980483455cc0 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -43,7 +43,7 @@
  * LOCKING:
  * There are three level of locking required by epoll :
  *
- * 1) epmutex (mutex)
+ * 1) epnested_mutex (mutex)
  * 2) ep->mtx (mutex)
  * 3) ep->lock (rwlock)
  *
@@ -57,14 +57,8 @@
  * we need a lock that will allow us to sleep. This lock is a
  * mutex (ep->mtx). It is acquired during the event transfer loop,
  * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
- * Then we also need a global mutex to serialize eventpoll_release_file()
- * and ep_free().
- * This mutex is acquired by ep_free() during the epoll file
- * cleanup path and it is also acquired by eventpoll_release_file()
- * if a file has been pushed inside an epoll set and it is then
- * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
- * It is also acquired when inserting an epoll fd onto another epoll
- * fd. We do this so that we walk the epoll tree and ensure that this
+ * The epnested_mutex is acquired when inserting an epoll fd onto another
+ * epoll fd. We do this so that we walk the epoll tree and ensure that this
  * insertion does not create a cycle of epoll file descriptors, which
  * could lead to deadlock. We need a global mutex to prevent two
  * simultaneous inserts (A into B and B into A) from racing and
@@ -80,9 +74,9 @@
  * of epoll file descriptors, we use the current recursion depth as
  * the lockdep subkey.
  * It is possible to drop the "ep->mtx" and to use the global
- * mutex "epmutex" (together with "ep->lock") to have it working,
+ * mutex "epnested_mutex" (together with "ep->lock") to have it working,
  * but having "ep->mtx" will make the interface more scalable.
- * Events that require holding "epmutex" are very rare, while for
+ * Events that require holding "epnested_mutex" are very rare, while for
  * normal operations the epoll private "ep->mtx" will guarantee
  * a better scalability.
  */
@@ -153,6 +147,13 @@ struct epitem {
 	/* The file descriptor information this item refers to */
 	struct epoll_filefd ffd;
 
+	/*
+	 * Protected by file->f_lock, true for to-be-released epitem already
+	 * removed from the "struct file" items list; together with
+	 * eventpoll->refcount orchestrates "struct eventpoll" disposal
+	 */
+	bool dying;
+
 	/* List containing poll wait queues */
 	struct eppoll_entry *pwqlist;
 
@@ -217,6 +218,12 @@ struct eventpoll {
 	u64 gen;
 	struct hlist_head refs;
 
+	/*
+	 * usage count, used together with epitem->dying to
+	 * orchestrate the disposal of this struct
+	 */
+	refcount_t refcount;
+
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	/* used to track busy poll napi_id */
 	unsigned int napi_id;
@@ -240,10 +247,8 @@ struct ep_pqueue {
 /* Maximum number of epoll watched descriptors, per user */
 static long max_user_watches __read_mostly;
 
-/*
- * This mutex is used to serialize ep_free() and eventpoll_release_file().
- */
-static DEFINE_MUTEX(epmutex);
+/* Used for cycles detection */
+static DEFINE_MUTEX(epnested_mutex);
 
 static u64 loop_check_gen = 0;
 
@@ -258,7 +263,7 @@ static struct kmem_cache *pwq_cache __read_mostly;
 
 /*
  * List of files with newly added links, where we may need to limit the number
- * of emanating paths. Protected by the epmutex.
+ * of emanating paths. Protected by the epnested_mutex.
  */
 struct epitems_head {
 	struct hlist_head epitems;
@@ -557,8 +562,7 @@ static void ep_remove_wait_queue(struct eppoll_entry *pwq)
 
 /*
  * This function unregisters poll callbacks from the associated file
- * descriptor.  Must be called with "mtx" held (or "epmutex" if called from
- * ep_free).
+ * descriptor.  Must be called with "mtx" held.
  */
 static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
 {
@@ -681,11 +685,40 @@ static void epi_rcu_free(struct rcu_head *head)
 	kmem_cache_free(epi_cache, epi);
 }
 
+static void ep_get(struct eventpoll *ep)
+{
+	refcount_inc(&ep->refcount);
+}
+
+/*
+ * Returns true if the event poll can be disposed
+ */
+static bool ep_refcount_dec_and_test(struct eventpoll *ep)
+{
+	if (!refcount_dec_and_test(&ep->refcount))
+		return false;
+
+	WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root));
+	return true;
+}
+
+static void ep_free(struct eventpoll *ep)
+{
+	mutex_destroy(&ep->mtx);
+	free_uid(ep->user);
+	wakeup_source_unregister(ep->ws);
+	kfree(ep);
+}
+
 /*
  * Removes a "struct epitem" from the eventpoll RB tree and deallocates
  * all the associated resources. Must be called with "mtx" held.
+ * If the dying flag is set, do the removal only if force is true.
+ * This prevents ep_clear_and_put() from dropping all the ep references
+ * while running concurrently with eventpoll_release_file().
+ * Returns true if the eventpoll can be disposed.
  */
-static int ep_remove(struct eventpoll *ep, struct epitem *epi)
+static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
 {
 	struct file *file = epi->ffd.file;
 	struct epitems_head *to_free;
@@ -700,6 +733,11 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 
 	/* Remove the current item from the list of epoll hooks */
 	spin_lock(&file->f_lock);
+	if (epi->dying && !force) {
+		spin_unlock(&file->f_lock);
+		return false;
+	}
+
 	to_free = NULL;
 	head = file->f_ep;
 	if (head->first == &epi->fllink && !epi->fllink.next) {
@@ -733,28 +771,28 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	call_rcu(&epi->rcu, epi_rcu_free);
 
 	percpu_counter_dec(&ep->user->epoll_watches);
+	return ep_refcount_dec_and_test(ep);
+}
 
-	return 0;
+/*
+ * ep_remove variant for callers owing an additional reference to the ep
+ */
+static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi)
+{
+	WARN_ON_ONCE(__ep_remove(ep, epi, false));
 }
 
-static void ep_free(struct eventpoll *ep)
+static void ep_clear_and_put(struct eventpoll *ep)
 {
-	struct rb_node *rbp;
+	struct rb_node *rbp, *next;
 	struct epitem *epi;
+	bool dispose;
 
 	/* We need to release all tasks waiting for these file */
 	if (waitqueue_active(&ep->poll_wait))
 		ep_poll_safewake(ep, NULL, 0);
 
-	/*
-	 * We need to lock this because we could be hit by
-	 * eventpoll_release_file() while we're freeing the "struct eventpoll".
-	 * We do not need to hold "ep->mtx" here because the epoll file
-	 * is on the way to be removed and no one has references to it
-	 * anymore. The only hit might come from eventpoll_release_file() but
-	 * holding "epmutex" is sufficient here.
-	 */
-	mutex_lock(&epmutex);
+	mutex_lock(&ep->mtx);
 
 	/*
 	 * Walks through the whole tree by unregistering poll callbacks.
@@ -767,26 +805,25 @@ static void ep_free(struct eventpoll *ep)
 	}
 
 	/*
-	 * Walks through the whole tree by freeing each "struct epitem". At this
-	 * point we are sure no poll callbacks will be lingering around, and also by
-	 * holding "epmutex" we can be sure that no file cleanup code will hit
-	 * us during this operation. So we can avoid the lock on "ep->lock".
-	 * We do not need to lock ep->mtx, either, we only do it to prevent
-	 * a lockdep warning.
+	 * Walks through the whole tree and try to free each "struct epitem".
+	 * Note that ep_remove_safe() will not remove the epitem in case of a
+	 * racing eventpoll_release_file(); the latter will do the removal.
+	 * At this point we are sure no poll callbacks will be lingering around.
+	 * Since we still own a reference to the eventpoll struct, the loop can't
+	 * dispose it.
 	 */
-	mutex_lock(&ep->mtx);
-	while ((rbp = rb_first_cached(&ep->rbr)) != NULL) {
+	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) {
+		next = rb_next(rbp);
 		epi = rb_entry(rbp, struct epitem, rbn);
-		ep_remove(ep, epi);
+		ep_remove_safe(ep, epi);
 		cond_resched();
 	}
+
+	dispose = ep_refcount_dec_and_test(ep);
 	mutex_unlock(&ep->mtx);
 
-	mutex_unlock(&epmutex);
-	mutex_destroy(&ep->mtx);
-	free_uid(ep->user);
-	wakeup_source_unregister(ep->ws);
-	kfree(ep);
+	if (dispose)
+		ep_free(ep);
 }
 
 static int ep_eventpoll_release(struct inode *inode, struct file *file)
@@ -794,7 +831,7 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
 	struct eventpoll *ep = file->private_data;
 
 	if (ep)
-		ep_free(ep);
+		ep_clear_and_put(ep);
 
 	return 0;
 }
@@ -906,33 +943,34 @@ void eventpoll_release_file(struct file *file)
 {
 	struct eventpoll *ep;
 	struct epitem *epi;
-	struct hlist_node *next;
+	bool dispose;
 
 	/*
-	 * We don't want to get "file->f_lock" because it is not
-	 * necessary. It is not necessary because we're in the "struct file"
-	 * cleanup path, and this means that no one is using this file anymore.
-	 * So, for example, epoll_ctl() cannot hit here since if we reach this
-	 * point, the file counter already went to zero and fget() would fail.
-	 * The only hit might come from ep_free() but by holding the mutex
-	 * will correctly serialize the operation. We do need to acquire
-	 * "ep->mtx" after "epmutex" because ep_remove() requires it when called
-	 * from anywhere but ep_free().
-	 *
-	 * Besides, ep_remove() acquires the lock, so we can't hold it here.
+	 * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from
+	 * touching the epitems list before eventpoll_release_file() can access
+	 * the ep->mtx.
 	 */
-	mutex_lock(&epmutex);
-	if (unlikely(!file->f_ep)) {
-		mutex_unlock(&epmutex);
-		return;
-	}
-	hlist_for_each_entry_safe(epi, next, file->f_ep, fllink) {
+again:
+	spin_lock(&file->f_lock);
+	if (file->f_ep && file->f_ep->first) {
+		epi = hlist_entry(file->f_ep->first, struct epitem, fllink);
+		epi->dying = true;
+		spin_unlock(&file->f_lock);
+
+		/*
+		 * ep access is safe as we still own a reference to the ep
+		 * struct
+		 */
 		ep = epi->ep;
-		mutex_lock_nested(&ep->mtx, 0);
-		ep_remove(ep, epi);
+		mutex_lock(&ep->mtx);
+		dispose = __ep_remove(ep, epi, true);
 		mutex_unlock(&ep->mtx);
+
+		if (dispose)
+			ep_free(ep);
+		goto again;
 	}
-	mutex_unlock(&epmutex);
+	spin_unlock(&file->f_lock);
 }
 
 static int ep_alloc(struct eventpoll **pep)
@@ -955,6 +993,7 @@ static int ep_alloc(struct eventpoll **pep)
 	ep->rbr = RB_ROOT_CACHED;
 	ep->ovflist = EP_UNACTIVE_PTR;
 	ep->user = user;
+	refcount_set(&ep->refcount, 1);
 
 	*pep = ep;
 
@@ -1223,10 +1262,10 @@ out_unlock:
 		 */
 		list_del_init(&wait->entry);
 		/*
-		 * ->whead != NULL protects us from the race with ep_free()
-		 * or ep_remove(), ep_remove_wait_queue() takes whead->lock
-		 * held by the caller. Once we nullify it, nothing protects
-		 * ep/epi or even wait.
+		 * ->whead != NULL protects us from the race with
+		 * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue()
+		 * takes whead->lock held by the caller. Once we nullify it,
+		 * nothing protects ep/epi or even wait.
 		 */
 		smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
 	}
@@ -1298,7 +1337,7 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
  * is connected to n file sources. In this case each file source has 1 path
  * of length 1. Thus, the numbers below should be more than sufficient. These
  * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
- * and delete can't add additional paths. Protected by the epmutex.
+ * and delete can't add additional paths. Protected by the epnested_mutex.
  */
 static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
 static int path_count[PATH_ARR_SIZE];
@@ -1496,16 +1535,22 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 	if (tep)
 		mutex_unlock(&tep->mtx);
 
+	/*
+	 * ep_remove_safe() calls in the later error paths can't lead to
+	 * ep_free() as the ep file itself still holds an ep reference.
+	 */
+	ep_get(ep);
+
 	/* now check if we've created too many backpaths */
 	if (unlikely(full_check && reverse_path_check())) {
-		ep_remove(ep, epi);
+		ep_remove_safe(ep, epi);
 		return -EINVAL;
 	}
 
 	if (epi->event.events & EPOLLWAKEUP) {
 		error = ep_create_wakeup_source(epi);
 		if (error) {
-			ep_remove(ep, epi);
+			ep_remove_safe(ep, epi);
 			return error;
 		}
 	}
@@ -1529,7 +1574,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 	 * high memory pressure.
 	 */
 	if (unlikely(!epq.epi)) {
-		ep_remove(ep, epi);
+		ep_remove_safe(ep, epi);
 		return -ENOMEM;
 	}
 
@@ -2025,7 +2070,7 @@ static int do_epoll_create(int flags)
 out_free_fd:
 	put_unused_fd(fd);
 out_free_ep:
-	ep_free(ep);
+	ep_clear_and_put(ep);
 	return error;
 }
 
@@ -2135,7 +2180,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 	 * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
 	 * the epoll file descriptor is attaching directly to a wakeup source,
 	 * unless the epoll file descriptor is nested. The purpose of taking the
-	 * 'epmutex' on add is to prevent complex toplogies such as loops and
+	 * 'epnested_mutex' on add is to prevent complex toplogies such as loops and
 	 * deep wakeup paths from forming in parallel through multiple
 	 * EPOLL_CTL_ADD operations.
 	 */
@@ -2146,7 +2191,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 		if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
 		    is_file_epoll(tf.file)) {
 			mutex_unlock(&ep->mtx);
-			error = epoll_mutex_lock(&epmutex, 0, nonblock);
+			error = epoll_mutex_lock(&epnested_mutex, 0, nonblock);
 			if (error)
 				goto error_tgt_fput;
 			loop_check_gen++;
@@ -2180,10 +2225,16 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 			error = -EEXIST;
 		break;
 	case EPOLL_CTL_DEL:
-		if (epi)
-			error = ep_remove(ep, epi);
-		else
+		if (epi) {
+			/*
+			 * The eventpoll itself is still alive: the refcount
+			 * can't go to zero here.
+			 */
+			ep_remove_safe(ep, epi);
+			error = 0;
+		} else {
 			error = -ENOENT;
+		}
 		break;
 	case EPOLL_CTL_MOD:
 		if (epi) {
@@ -2201,7 +2252,7 @@ error_tgt_fput:
 	if (full_check) {
 		clear_tfile_check_list();
 		loop_check_gen++;
-		mutex_unlock(&epmutex);
+		mutex_unlock(&epnested_mutex);
 	}
 
 	fdput(tf);
diff --git a/fs/exec.c b/fs/exec.c
index 7c44d0c65b1b..a466e797c8e2 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -65,6 +65,7 @@
 #include <linux/syscall_user_dispatch.h>
 #include <linux/coredump.h>
 #include <linux/time_namespace.h>
+#include <linux/user_events.h>
 
 #include <linux/uaccess.h>
 #include <asm/mmu_context.h>
@@ -1034,7 +1035,7 @@ static int exec_mmap(struct mm_struct *mm)
 		mmput(old_mm);
 		return 0;
 	}
-	mmdrop(active_mm);
+	mmdrop_lazy_tlb(active_mm);
 	return 0;
 }
 
@@ -1859,6 +1860,7 @@ static int bprm_execve(struct linux_binprm *bprm,
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
 	rseq_execve(current);
+	user_events_execve(current);
 	acct_update_integrals(current);
 	task_numa_free(current, false);
 	return retval;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e79c767cc5e0..35703dce23a3 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5795,7 +5795,8 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
 	 * mapped - no physical clusters have been allocated, and the
 	 * file has no extents
 	 */
-	if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
+	if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) ||
+	    ext4_has_inline_data(inode))
 		return 0;
 
 	/* search for the extent closest to the first block in the cluster */
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index b9fb1177fff6..859bc4e2c9b0 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -566,9 +566,9 @@ retry:
 	 * started */
 	folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
 			mapping_gfp_mask(mapping));
-	if (!folio) {
-		ret = -ENOMEM;
-		goto out;
+	if (IS_ERR(folio)) {
+		ret = PTR_ERR(folio);
+		goto out_nofolio;
 	}
 
 	ext4_write_lock_xattr(inode, &no_expand);
@@ -633,6 +633,7 @@ out:
 		folio_unlock(folio);
 		folio_put(folio);
 	}
+out_nofolio:
 	if (sem_held)
 		ext4_write_unlock_xattr(inode, &no_expand);
 	if (handle)
@@ -693,8 +694,8 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
 
 	folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
 					mapping_gfp_mask(mapping));
-	if (!folio) {
-		ret = -ENOMEM;
+	if (IS_ERR(folio)) {
+		ret = PTR_ERR(folio);
 		goto out;
 	}
 
@@ -854,8 +855,8 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
 
 	folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN,
 					mapping_gfp_mask(mapping));
-	if (!folio)
-		return -ENOMEM;
+	if (IS_ERR(folio))
+		return PTR_ERR(folio);
 
 	down_read(&EXT4_I(inode)->xattr_sem);
 	if (!ext4_has_inline_data(inode)) {
@@ -947,8 +948,8 @@ retry_journal:
 	 */
 	folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
 					mapping_gfp_mask(mapping));
-	if (!folio) {
-		ret = -ENOMEM;
+	if (IS_ERR(folio)) {
+		ret = PTR_ERR(folio);
 		goto out_journal;
 	}
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8dbd352e3986..0d5ba922e411 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1162,8 +1162,8 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
 retry_grab:
 	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
 					mapping_gfp_mask(mapping));
-	if (!folio)
-		return -ENOMEM;
+	if (IS_ERR(folio))
+		return PTR_ERR(folio);
 	/*
 	 * The same as page allocation, we prealloc buffer heads before
 	 * starting the handle.
@@ -2906,8 +2906,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 retry:
 	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
 			mapping_gfp_mask(mapping));
-	if (!folio)
-		return -ENOMEM;
+	if (IS_ERR(folio))
+		return PTR_ERR(folio);
 
 	/* In case writeback began while the folio was unlocked */
 	folio_wait_stable(folio);
@@ -2982,6 +2982,9 @@ static int ext4_da_write_end(struct file *file,
 	    ext4_has_inline_data(inode))
 		return ext4_write_inline_data_end(inode, pos, len, copied, page);
 
+	if (unlikely(copied < len) && !PageUptodate(page))
+		copied = 0;
+
 	start = pos & (PAGE_SIZE - 1);
 	end = start + copied - 1;
 
@@ -3618,8 +3621,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
 	folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT,
 				    FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
 				    mapping_gfp_constraint(mapping, ~__GFP_FS));
-	if (!folio)
-		return -ENOMEM;
+	if (IS_ERR(folio))
+		return PTR_ERR(folio);
 
 	blocksize = inode->i_sb->s_blocksize;
 
@@ -5201,7 +5204,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
 	while (1) {
 		struct folio *folio = filemap_lock_folio(inode->i_mapping,
 				      inode->i_size >> PAGE_SHIFT);
-		if (!folio)
+		if (IS_ERR(folio))
 			return;
 		ret = __ext4_journalled_invalidate_folio(folio, offset,
 						folio_size(folio) - offset);
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 4681fff6665f..4022bc713421 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -282,6 +282,7 @@ int ext4_multi_mount_protect(struct super_block *sb,
 	if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
 	    mmp_block >= ext4_blocks_count(es)) {
 		ext4_warning(sb, "Invalid MMP block in superblock");
+		retval = -EINVAL;
 		goto failed;
 	}
 
@@ -307,6 +308,7 @@ int ext4_multi_mount_protect(struct super_block *sb,
 
 	if (seq == EXT4_MMP_SEQ_FSCK) {
 		dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
+		retval = -EBUSY;
 		goto failed;
 	}
 
@@ -320,6 +322,7 @@ int ext4_multi_mount_protect(struct super_block *sb,
 
 	if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
 		ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+		retval = -ETIMEDOUT;
 		goto failed;
 	}
 
@@ -330,6 +333,7 @@ int ext4_multi_mount_protect(struct super_block *sb,
 	if (seq != le32_to_cpu(mmp->mmp_seq)) {
 		dump_mmp_msg(sb, mmp,
 			     "Device is already active on another node.");
+		retval = -EBUSY;
 		goto failed;
 	}
 
@@ -349,6 +353,7 @@ skip:
 	 */
 	if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
 		ext4_warning(sb, "MMP startup interrupted, failing mount");
+		retval = -ETIMEDOUT;
 		goto failed;
 	}
 
@@ -359,6 +364,7 @@ skip:
 	if (seq != le32_to_cpu(mmp->mmp_seq)) {
 		dump_mmp_msg(sb, mmp,
 			     "Device is already active on another node.");
+		retval = -EBUSY;
 		goto failed;
 	}
 
@@ -378,6 +384,7 @@ skip:
 		EXT4_SB(sb)->s_mmp_tsk = NULL;
 		ext4_warning(sb, "Unable to create kmmpd thread for %s.",
 			     sb->s_id);
+		retval = -ENOMEM;
 		goto failed;
 	}
 
@@ -385,5 +392,5 @@ skip:
 
 failed:
 	brelse(bh);
-	return 1;
+	return retval;
 }
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index e509c22a21ed..b5af2fc03b2f 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -140,18 +140,18 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2,
 	flags = memalloc_nofs_save();
 	folio[0] = __filemap_get_folio(mapping[0], index1, FGP_WRITEBEGIN,
 			mapping_gfp_mask(mapping[0]));
-	if (!folio[0]) {
+	if (IS_ERR(folio[0])) {
 		memalloc_nofs_restore(flags);
-		return -ENOMEM;
+		return PTR_ERR(folio[0]);
 	}
 
 	folio[1] = __filemap_get_folio(mapping[1], index2, FGP_WRITEBEGIN,
 			mapping_gfp_mask(mapping[1]));
 	memalloc_nofs_restore(flags);
-	if (!folio[1]) {
+	if (IS_ERR(folio[1])) {
 		folio_unlock(folio[0]);
 		folio_put(folio[0]);
-		return -ENOMEM;
+		return PTR_ERR(folio[1]);
 	}
 	/*
 	 * __filemap_get_folio() may not wait on folio's writeback if
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d03bf0ecf505..d39f386e9baf 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1259,7 +1259,7 @@ static void ext4_put_super(struct super_block *sb)
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
 	int aborted = 0;
-	int i, err;
+	int err;
 
 	/*
 	 * Unregister sysfs before destroying jbd2 journal.
@@ -1311,7 +1311,7 @@ static void ext4_put_super(struct super_block *sb)
 	ext4_flex_groups_free(sbi);
 	ext4_percpu_param_destroy(sbi);
 #ifdef CONFIG_QUOTA
-	for (i = 0; i < EXT4_MAXQUOTAS; i++)
+	for (int i = 0; i < EXT4_MAXQUOTAS; i++)
 		kfree(get_qf_name(sb, sbi, i));
 #endif
 
@@ -5196,10 +5196,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	ext4_fsblk_t logical_sb_block;
 	struct inode *root;
-	int ret = -ENOMEM;
-	unsigned int i;
 	int needs_recovery;
-	int err = 0;
+	int err;
 	ext4_group_t first_not_zeroed;
 	struct ext4_fs_context *ctx = fc->fs_private;
 	int silent = fc->sb_flags & SB_SILENT;
@@ -5212,8 +5210,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 	sbi->s_sectors_written_start =
 		part_stat_read(sb->s_bdev, sectors[STAT_WRITE]);
 
-	/* -EINVAL is default */
-	ret = -EINVAL;
 	err = ext4_load_super(sb, &logical_sb_block, silent);
 	if (err)
 		goto out_fail;
@@ -5239,7 +5235,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 	 */
 	sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
 
-	if (ext4_inode_info_init(sb, es))
+	err = ext4_inode_info_init(sb, es);
+	if (err)
 		goto failed_mount;
 
 	err = parse_apply_sb_mount_options(sb, ctx);
@@ -5255,10 +5252,12 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 
 	ext4_apply_options(fc, sb);
 
-	if (ext4_encoding_init(sb, es))
+	err = ext4_encoding_init(sb, es);
+	if (err)
 		goto failed_mount;
 
-	if (ext4_check_journal_data_mode(sb))
+	err = ext4_check_journal_data_mode(sb);
+	if (err)
 		goto failed_mount;
 
 	sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
@@ -5267,18 +5266,22 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 	/* i_version is always enabled now */
 	sb->s_flags |= SB_I_VERSION;
 
-	if (ext4_check_feature_compatibility(sb, es, silent))
+	err = ext4_check_feature_compatibility(sb, es, silent);
+	if (err)
 		goto failed_mount;
 
-	if (ext4_block_group_meta_init(sb, silent))
+	err = ext4_block_group_meta_init(sb, silent);
+	if (err)
 		goto failed_mount;
 
 	ext4_hash_info_init(sb);
 
-	if (ext4_handle_clustersize(sb))
+	err = ext4_handle_clustersize(sb);
+	if (err)
 		goto failed_mount;
 
-	if (ext4_check_geometry(sb, es))
+	err = ext4_check_geometry(sb, es);
+	if (err)
 		goto failed_mount;
 
 	timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
@@ -5289,8 +5292,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 	if (err)
 		goto failed_mount3;
 
-	/* Register extent status tree shrinker */
-	if (ext4_es_register_shrinker(sbi))
+	err = ext4_es_register_shrinker(sbi);
+	if (err)
 		goto failed_mount3;
 
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
@@ -5329,10 +5332,13 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 			  ext4_has_feature_orphan_present(sb) ||
 			  ext4_has_feature_journal_needs_recovery(sb));
 
-	if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb))
-		if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
+	if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) {
+		err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block));
+		if (err)
 			goto failed_mount3a;
+	}
 
+	err = -EINVAL;
 	/*
 	 * The first inode we look at is the journal inode.  Don't try
 	 * root first: it may be modified in the journal!
@@ -5384,6 +5390,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 		if (!sbi->s_ea_block_cache) {
 			ext4_msg(sb, KERN_ERR,
 				 "Failed to create ea_block_cache");
+			err = -EINVAL;
 			goto failed_mount_wq;
 		}
 
@@ -5392,6 +5399,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 			if (!sbi->s_ea_inode_cache) {
 				ext4_msg(sb, KERN_ERR,
 					 "Failed to create ea_inode_cache");
+				err = -EINVAL;
 				goto failed_mount_wq;
 			}
 		}
@@ -5426,7 +5434,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 		alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
 	if (!EXT4_SB(sb)->rsv_conversion_wq) {
 		printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
-		ret = -ENOMEM;
+		err = -ENOMEM;
 		goto failed_mount4;
 	}
 
@@ -5438,28 +5446,28 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 	root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL);
 	if (IS_ERR(root)) {
 		ext4_msg(sb, KERN_ERR, "get root inode failed");
-		ret = PTR_ERR(root);
+		err = PTR_ERR(root);
 		root = NULL;
 		goto failed_mount4;
 	}
 	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
 		ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
 		iput(root);
+		err = -EFSCORRUPTED;
 		goto failed_mount4;
 	}
 
 	sb->s_root = d_make_root(root);
 	if (!sb->s_root) {
 		ext4_msg(sb, KERN_ERR, "get root dentry failed");
-		ret = -ENOMEM;
+		err = -ENOMEM;
 		goto failed_mount4;
 	}
 
-	ret = ext4_setup_super(sb, es, sb_rdonly(sb));
-	if (ret == -EROFS) {
+	err = ext4_setup_super(sb, es, sb_rdonly(sb));
+	if (err == -EROFS) {
 		sb->s_flags |= SB_RDONLY;
-		ret = 0;
-	} else if (ret)
+	} else if (err)
 		goto failed_mount4a;
 
 	ext4_set_resv_clusters(sb);
@@ -5503,7 +5511,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 		sbi->s_journal->j_commit_callback =
 			ext4_journal_commit_callback;
 
-	if (ext4_percpu_param_init(sbi))
+	err = ext4_percpu_param_init(sbi);
+	if (err)
 		goto failed_mount6;
 
 	if (ext4_has_feature_flex_bg(sb))
@@ -5511,7 +5520,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 			ext4_msg(sb, KERN_ERR,
 			       "unable to initialize "
 			       "flex_bg meta info!");
-			ret = -ENOMEM;
+			err = -ENOMEM;
 			goto failed_mount6;
 		}
 
@@ -5628,7 +5637,7 @@ failed_mount:
 #endif
 
 #ifdef CONFIG_QUOTA
-	for (i = 0; i < EXT4_MAXQUOTAS; i++)
+	for (unsigned int i = 0; i < EXT4_MAXQUOTAS; i++)
 		kfree(get_qf_name(sb, sbi, i));
 #endif
 	fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
@@ -5637,7 +5646,7 @@ failed_mount:
 	ext4_blkdev_remove(sbi);
 out_fail:
 	sb->s_fs_info = NULL;
-	return err ? err : ret;
+	return err;
 }
 
 static int ext4_fill_super(struct super_block *sb, struct fs_context *fc)
@@ -6565,12 +6574,12 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
 				goto restore_opts;
 
 			sb->s_flags &= ~SB_RDONLY;
-			if (ext4_has_feature_mmp(sb))
-				if (ext4_multi_mount_protect(sb,
-						le64_to_cpu(es->s_mmp_block))) {
-					err = -EROFS;
+			if (ext4_has_feature_mmp(sb)) {
+				err = ext4_multi_mount_protect(sb,
+						le64_to_cpu(es->s_mmp_block));
+				if (err)
 					goto restore_opts;
-				}
+			}
 #ifdef CONFIG_QUOTA
 			enable_quota = 1;
 #endif
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index 3b01247066dd..2f37e1ea3955 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -366,14 +366,16 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode,
 	index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
 
 	folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0);
-	if (!folio || !folio_test_uptodate(folio)) {
+	if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
 		DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
 
-		if (folio)
+		if (!IS_ERR(folio))
 			folio_put(folio);
 		else if (num_ra_pages > 1)
 			page_cache_ra_unbounded(&ractl, num_ra_pages, 0);
 		folio = read_mapping_folio(inode->i_mapping, index, NULL);
+		if (IS_ERR(folio))
+			return ERR_CAST(folio);
 	}
 	return folio_file_page(folio, index);
 }
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 179a5c5e28fd..91e89e68177e 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -623,7 +623,7 @@ static int __init cuse_init(void)
 	/* CUSE is not prepared for FUSE_DEV_IOC_CLONE */
 	cuse_channel_fops.unlocked_ioctl	= NULL;
 
-	cuse_class = class_create(THIS_MODULE, "cuse");
+	cuse_class = class_create("cuse");
 	if (IS_ERR(cuse_class))
 		return PTR_ERR(cuse_class);
 
diff --git a/fs/hostfs/Makefile b/fs/hostfs/Makefile
index 587bcd6e50a3..16be592e8085 100644
--- a/fs/hostfs/Makefile
+++ b/fs/hostfs/Makefile
@@ -3,9 +3,11 @@
 # Licensed under the GPL
 #
 
-hostfs-objs := hostfs_kern.o hostfs_user.o
+hostfs-objs := hostfs_kern.o
 
-obj-y :=
+hostfs-builtin-$(CONFIG_HOSTFS) += hostfs_user.o hostfs_user_exp.o
+
+obj-y := $(hostfs-builtin-y) $(hostfs-builtin-m)
 obj-$(CONFIG_HOSTFS) += hostfs.o
 
 include $(srctree)/arch/um/scripts/Makefile.rules
diff --git a/fs/hostfs/hostfs_user_exp.c b/fs/hostfs/hostfs_user_exp.c
new file mode 100644
index 000000000000..250c91c55c46
--- /dev/null
+++ b/fs/hostfs/hostfs_user_exp.c
@@ -0,0 +1,28 @@
+#include <linux/module.h>
+#include "hostfs.h"
+
+EXPORT_SYMBOL_GPL(stat_file);
+EXPORT_SYMBOL_GPL(access_file);
+EXPORT_SYMBOL_GPL(open_file);
+EXPORT_SYMBOL_GPL(open_dir);
+EXPORT_SYMBOL_GPL(seek_dir);
+EXPORT_SYMBOL_GPL(read_dir);
+EXPORT_SYMBOL_GPL(read_file);
+EXPORT_SYMBOL_GPL(write_file);
+EXPORT_SYMBOL_GPL(lseek_file);
+EXPORT_SYMBOL_GPL(fsync_file);
+EXPORT_SYMBOL_GPL(replace_file);
+EXPORT_SYMBOL_GPL(close_file);
+EXPORT_SYMBOL_GPL(close_dir);
+EXPORT_SYMBOL_GPL(file_create);
+EXPORT_SYMBOL_GPL(set_attr);
+EXPORT_SYMBOL_GPL(make_symlink);
+EXPORT_SYMBOL_GPL(unlink_file);
+EXPORT_SYMBOL_GPL(do_mkdir);
+EXPORT_SYMBOL_GPL(hostfs_do_rmdir);
+EXPORT_SYMBOL_GPL(do_mknod);
+EXPORT_SYMBOL_GPL(link_file);
+EXPORT_SYMBOL_GPL(hostfs_do_readlink);
+EXPORT_SYMBOL_GPL(rename_file);
+EXPORT_SYMBOL_GPL(rename2_file);
+EXPORT_SYMBOL_GPL(do_statfs);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9062da6da567..ecfdfb2529a3 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -208,7 +208,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
 
 	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
 	info.length = len;
-	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
+	info.low_limit = PAGE_SIZE;
 	info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
 	info.align_offset = 0;
@@ -697,7 +697,7 @@ static void hugetlbfs_zero_partial_page(struct hstate *h,
 	struct folio *folio;
 
 	folio = filemap_lock_folio(mapping, idx);
-	if (!folio)
+	if (IS_ERR(folio))
 		return;
 
 	start = start & ~huge_page_mask(h);
diff --git a/fs/inode.c b/fs/inode.c
index 3ec5a8f7b644..577799b7855f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -864,8 +864,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
 				__count_vm_events(KSWAPD_INODESTEAL, reap);
 			else
 				__count_vm_events(PGINODESTEAL, reap);
-			if (current->reclaim_state)
-				current->reclaim_state->reclaimed_slab += reap;
+			mm_account_reclaimed_pages(reap);
 		}
 		iput(inode);
 		spin_lock(lru_lock);
diff --git a/fs/internal.h b/fs/internal.h
index ab36ed8fa41c..bd3b2810a36b 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -59,8 +59,6 @@ extern int finish_clean_context(struct fs_context *fc);
  */
 extern int filename_lookup(int dfd, struct filename *name, unsigned flags,
 			   struct path *path, struct path *root);
-extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
-			   const char *, unsigned int, struct path *);
 int do_rmdir(int dfd, struct filename *name);
 int do_unlinkat(int dfd, struct filename *name);
 int may_linkat(struct mnt_idmap *idmap, const struct path *link);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 10a203515583..063133ec77f4 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -468,19 +468,12 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
 struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
 {
 	unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
-	struct folio *folio;
 
 	if (iter->flags & IOMAP_NOWAIT)
 		fgp |= FGP_NOWAIT;
 
-	folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
+	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
 			fgp, mapping_gfp_mask(iter->inode->i_mapping));
-	if (folio)
-		return folio;
-
-	if (iter->flags & IOMAP_NOWAIT)
-		return ERR_PTR(-EAGAIN);
-	return ERR_PTR(-ENOMEM);
 }
 EXPORT_SYMBOL_GPL(iomap_get_folio);
 
@@ -911,7 +904,7 @@ static int iomap_write_delalloc_scan(struct inode *inode,
 		/* grab locked page */
 		folio = filemap_lock_folio(inode->i_mapping,
 				start_byte >> PAGE_SHIFT);
-		if (!folio) {
+		if (IS_ERR(folio)) {
 			start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) +
 					PAGE_SIZE;
 			continue;
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index f771001574d0..019cc87d0fb3 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -130,6 +130,7 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
 	if (ret > 0)
 		ret += dio->done_before;
 
+	trace_iomap_dio_complete(iocb, dio->error, ret);
 	kfree(dio);
 
 	return ret;
@@ -493,6 +494,8 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	struct blk_plug plug;
 	struct iomap_dio *dio;
 
+	trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before);
+
 	if (!iomi.len)
 		return NULL;
 
@@ -541,7 +544,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		}
 
 		/* for data sync or sync, we need sync completion processing */
-		if (iocb_is_dsync(iocb) && !(dio_flags & IOMAP_DIO_NOSYNC)) {
+		if (iocb_is_dsync(iocb)) {
 			dio->flags |= IOMAP_DIO_NEED_SYNC;
 
 		       /*
@@ -650,8 +653,10 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	 */
 	dio->wait_for_completion = wait_for_completion;
 	if (!atomic_dec_and_test(&dio->ref)) {
-		if (!wait_for_completion)
+		if (!wait_for_completion) {
+			trace_iomap_dio_rw_queued(inode, iomi.pos, iomi.len);
 			return ERR_PTR(-EIOCBQUEUED);
+		}
 
 		for (;;) {
 			set_current_state(TASK_UNINTERRUPTIBLE);
diff --git a/fs/iomap/trace.c b/fs/iomap/trace.c
index da217246b1a9..728d5443daf5 100644
--- a/fs/iomap/trace.c
+++ b/fs/iomap/trace.c
@@ -3,6 +3,7 @@
  * Copyright (c) 2019 Christoph Hellwig
  */
 #include <linux/iomap.h>
+#include <linux/uio.h>
 
 /*
  * We include this last to have the helpers above available for the trace
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index f6ea9540d082..c16fd55f5595 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -83,6 +83,7 @@ DEFINE_RANGE_EVENT(iomap_writepage);
 DEFINE_RANGE_EVENT(iomap_release_folio);
 DEFINE_RANGE_EVENT(iomap_invalidate_folio);
 DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail);
+DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
 
 #define IOMAP_TYPE_STRINGS \
 	{ IOMAP_HOLE,		"HOLE" }, \
@@ -107,6 +108,11 @@ DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail);
 	{ IOMAP_F_BUFFER_HEAD,	"BH" }, \
 	{ IOMAP_F_SIZE_CHANGED,	"SIZE_CHANGED" }
 
+#define IOMAP_DIO_STRINGS \
+	{IOMAP_DIO_FORCE_WAIT,	"DIO_FORCE_WAIT" }, \
+	{IOMAP_DIO_OVERWRITE_ONLY, "DIO_OVERWRITE_ONLY" }, \
+	{IOMAP_DIO_PARTIAL,	"DIO_PARTIAL" }
+
 DECLARE_EVENT_CLASS(iomap_class,
 	TP_PROTO(struct inode *inode, struct iomap *iomap),
 	TP_ARGS(inode, iomap),
@@ -183,6 +189,78 @@ TRACE_EVENT(iomap_iter,
 		   (void *)__entry->caller)
 );
 
+TRACE_EVENT(iomap_dio_rw_begin,
+	TP_PROTO(struct kiocb *iocb, struct iov_iter *iter,
+		 unsigned int dio_flags, size_t done_before),
+	TP_ARGS(iocb, iter, dio_flags, done_before),
+	TP_STRUCT__entry(
+		__field(dev_t,	dev)
+		__field(ino_t,	ino)
+		__field(loff_t, isize)
+		__field(loff_t, pos)
+		__field(size_t,	count)
+		__field(size_t,	done_before)
+		__field(int,	ki_flags)
+		__field(unsigned int,	dio_flags)
+		__field(bool,	aio)
+	),
+	TP_fast_assign(
+		__entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev;
+		__entry->ino = file_inode(iocb->ki_filp)->i_ino;
+		__entry->isize = file_inode(iocb->ki_filp)->i_size;
+		__entry->pos = iocb->ki_pos;
+		__entry->count = iov_iter_count(iter);
+		__entry->done_before = done_before;
+		__entry->ki_flags = iocb->ki_flags;
+		__entry->dio_flags = dio_flags;
+		__entry->aio = !is_sync_kiocb(iocb);
+	),
+	TP_printk("dev %d:%d ino 0x%lx size 0x%llx offset 0x%llx length 0x%zx done_before 0x%zx flags %s dio_flags %s aio %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->isize,
+		  __entry->pos,
+		  __entry->count,
+		  __entry->done_before,
+		  __print_flags(__entry->ki_flags, "|", TRACE_IOCB_STRINGS),
+		  __print_flags(__entry->dio_flags, "|", IOMAP_DIO_STRINGS),
+		  __entry->aio)
+);
+
+TRACE_EVENT(iomap_dio_complete,
+	TP_PROTO(struct kiocb *iocb, int error, ssize_t ret),
+	TP_ARGS(iocb, error, ret),
+	TP_STRUCT__entry(
+		__field(dev_t,	dev)
+		__field(ino_t,	ino)
+		__field(loff_t, isize)
+		__field(loff_t, pos)
+		__field(int,	ki_flags)
+		__field(bool,	aio)
+		__field(int,	error)
+		__field(ssize_t, ret)
+	),
+	TP_fast_assign(
+		__entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev;
+		__entry->ino = file_inode(iocb->ki_filp)->i_ino;
+		__entry->isize = file_inode(iocb->ki_filp)->i_size;
+		__entry->pos = iocb->ki_pos;
+		__entry->ki_flags = iocb->ki_flags;
+		__entry->aio = !is_sync_kiocb(iocb);
+		__entry->error = error;
+		__entry->ret = ret;
+	),
+	TP_printk("dev %d:%d ino 0x%lx size 0x%llx offset 0x%llx flags %s aio %d error %d ret %zd",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->isize,
+		  __entry->pos,
+		  __print_flags(__entry->ki_flags, "|", TRACE_IOCB_STRINGS),
+		  __entry->aio,
+		  __entry->error,
+		  __entry->ret)
+);
+
 #endif /* _IOMAP_TRACE_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 90de0e498371..45b6919903e6 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -17,7 +17,7 @@
 
 #include "kernfs-internal.h"
 
-static DEFINE_SPINLOCK(kernfs_rename_lock);	/* kn->parent and ->name */
+static DEFINE_RWLOCK(kernfs_rename_lock);	/* kn->parent and ->name */
 /*
  * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to
  * call pr_cont() while holding rename_lock. Because sometimes pr_cont()
@@ -196,9 +196,9 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
 	unsigned long flags;
 	int ret;
 
-	spin_lock_irqsave(&kernfs_rename_lock, flags);
+	read_lock_irqsave(&kernfs_rename_lock, flags);
 	ret = kernfs_name_locked(kn, buf, buflen);
-	spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+	read_unlock_irqrestore(&kernfs_rename_lock, flags);
 	return ret;
 }
 
@@ -224,9 +224,9 @@ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from,
 	unsigned long flags;
 	int ret;
 
-	spin_lock_irqsave(&kernfs_rename_lock, flags);
+	read_lock_irqsave(&kernfs_rename_lock, flags);
 	ret = kernfs_path_from_node_locked(to, from, buf, buflen);
-	spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+	read_unlock_irqrestore(&kernfs_rename_lock, flags);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(kernfs_path_from_node);
@@ -294,10 +294,10 @@ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
 	struct kernfs_node *parent;
 	unsigned long flags;
 
-	spin_lock_irqsave(&kernfs_rename_lock, flags);
+	read_lock_irqsave(&kernfs_rename_lock, flags);
 	parent = kn->parent;
 	kernfs_get(parent);
-	spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+	read_unlock_irqrestore(&kernfs_rename_lock, flags);
 
 	return parent;
 }
@@ -770,12 +770,15 @@ int kernfs_add_one(struct kernfs_node *kn)
 		goto out_unlock;
 
 	/* Update timestamps on the parent */
+	down_write(&root->kernfs_iattr_rwsem);
+
 	ps_iattr = parent->iattr;
 	if (ps_iattr) {
 		ktime_get_real_ts64(&ps_iattr->ia_ctime);
 		ps_iattr->ia_mtime = ps_iattr->ia_ctime;
 	}
 
+	up_write(&root->kernfs_iattr_rwsem);
 	up_write(&root->kernfs_rwsem);
 
 	/*
@@ -940,6 +943,8 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
 
 	idr_init(&root->ino_idr);
 	init_rwsem(&root->kernfs_rwsem);
+	init_rwsem(&root->kernfs_iattr_rwsem);
+	init_rwsem(&root->kernfs_supers_rwsem);
 	INIT_LIST_HEAD(&root->supers);
 
 	/*
@@ -1462,11 +1467,14 @@ static void __kernfs_remove(struct kernfs_node *kn)
 				pos->parent ? pos->parent->iattr : NULL;
 
 			/* update timestamps on the parent */
+			down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
+
 			if (ps_iattr) {
 				ktime_get_real_ts64(&ps_iattr->ia_ctime);
 				ps_iattr->ia_mtime = ps_iattr->ia_ctime;
 			}
 
+			up_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
 			kernfs_put(pos);
 		}
 
@@ -1723,7 +1731,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
 	kernfs_get(new_parent);
 
 	/* rename_lock protects ->parent and ->name accessors */
-	spin_lock_irq(&kernfs_rename_lock);
+	write_lock_irq(&kernfs_rename_lock);
 
 	old_parent = kn->parent;
 	kn->parent = new_parent;
@@ -1734,7 +1742,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
 		kn->name = new_name;
 	}
 
-	spin_unlock_irq(&kernfs_rename_lock);
+	write_unlock_irq(&kernfs_rename_lock);
 
 	kn->hash = kernfs_name_hash(kn->name, kn->ns);
 	kernfs_link_sibling(kn);
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index e4a50e4ff0d2..40c4661f15b7 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -922,8 +922,8 @@ repeat:
 
 	root = kernfs_root(kn);
 	/* kick fsnotify */
-	down_write(&root->kernfs_rwsem);
 
+	down_read(&root->kernfs_supers_rwsem);
 	list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
 		struct kernfs_node *parent;
 		struct inode *p_inode = NULL;
@@ -960,7 +960,7 @@ repeat:
 		iput(inode);
 	}
 
-	up_write(&root->kernfs_rwsem);
+	up_read(&root->kernfs_supers_rwsem);
 	kernfs_put(kn);
 	goto repeat;
 }
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 30494dcb0df3..b22b74d1a115 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -101,9 +101,9 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
 	int ret;
 	struct kernfs_root *root = kernfs_root(kn);
 
-	down_write(&root->kernfs_rwsem);
+	down_write(&root->kernfs_iattr_rwsem);
 	ret = __kernfs_setattr(kn, iattr);
-	up_write(&root->kernfs_rwsem);
+	up_write(&root->kernfs_iattr_rwsem);
 	return ret;
 }
 
@@ -119,7 +119,7 @@ int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 		return -EINVAL;
 
 	root = kernfs_root(kn);
-	down_write(&root->kernfs_rwsem);
+	down_write(&root->kernfs_iattr_rwsem);
 	error = setattr_prepare(&nop_mnt_idmap, dentry, iattr);
 	if (error)
 		goto out;
@@ -132,7 +132,7 @@ int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 	setattr_copy(&nop_mnt_idmap, inode, iattr);
 
 out:
-	up_write(&root->kernfs_rwsem);
+	up_write(&root->kernfs_iattr_rwsem);
 	return error;
 }
 
@@ -189,10 +189,10 @@ int kernfs_iop_getattr(struct mnt_idmap *idmap,
 	struct kernfs_node *kn = inode->i_private;
 	struct kernfs_root *root = kernfs_root(kn);
 
-	down_read(&root->kernfs_rwsem);
+	down_read(&root->kernfs_iattr_rwsem);
 	kernfs_refresh_inode(kn, inode);
 	generic_fillattr(&nop_mnt_idmap, inode, stat);
-	up_read(&root->kernfs_rwsem);
+	up_read(&root->kernfs_iattr_rwsem);
 
 	return 0;
 }
@@ -285,10 +285,10 @@ int kernfs_iop_permission(struct mnt_idmap *idmap,
 	kn = inode->i_private;
 	root = kernfs_root(kn);
 
-	down_read(&root->kernfs_rwsem);
+	down_read(&root->kernfs_iattr_rwsem);
 	kernfs_refresh_inode(kn, inode);
 	ret = generic_permission(&nop_mnt_idmap, inode, mask);
-	up_read(&root->kernfs_rwsem);
+	up_read(&root->kernfs_iattr_rwsem);
 
 	return ret;
 }
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 236c3a6113f1..a9b854cdfdb5 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -47,6 +47,8 @@ struct kernfs_root {
 
 	wait_queue_head_t	deactivate_waitq;
 	struct rw_semaphore	kernfs_rwsem;
+	struct rw_semaphore	kernfs_iattr_rwsem;
+	struct rw_semaphore	kernfs_supers_rwsem;
 };
 
 /* +1 to avoid triggering overflow warning when negating it */
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index e08e8d999807..d49606accb07 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -351,9 +351,9 @@ int kernfs_get_tree(struct fs_context *fc)
 		}
 		sb->s_flags |= SB_ACTIVE;
 
-		down_write(&root->kernfs_rwsem);
+		down_write(&root->kernfs_supers_rwsem);
 		list_add(&info->node, &info->root->supers);
-		up_write(&root->kernfs_rwsem);
+		up_write(&root->kernfs_supers_rwsem);
 	}
 
 	fc->root = dget(sb->s_root);
@@ -380,9 +380,9 @@ void kernfs_kill_sb(struct super_block *sb)
 	struct kernfs_super_info *info = kernfs_info(sb);
 	struct kernfs_root *root = info->root;
 
-	down_write(&root->kernfs_rwsem);
+	down_write(&root->kernfs_supers_rwsem);
 	list_del(&info->node);
-	up_write(&root->kernfs_rwsem);
+	up_write(&root->kernfs_supers_rwsem);
 
 	/*
 	 * Remove the superblock from fs_supers/s_instances
diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c
index 0d8242789dc8..0979577fd23e 100644
--- a/fs/ksmbd/server.c
+++ b/fs/ksmbd/server.c
@@ -415,7 +415,7 @@ int server_queue_ctrl_reset_work(void)
 	return __queue_ctrl_work(SERVER_CTRL_TYPE_RESET);
 }
 
-static ssize_t stats_show(struct class *class, struct class_attribute *attr,
+static ssize_t stats_show(const struct class *class, const struct class_attribute *attr,
 			  char *buf)
 {
 	/*
@@ -434,8 +434,8 @@ static ssize_t stats_show(struct class *class, struct class_attribute *attr,
 			  server_conf.ipc_last_active / HZ);
 }
 
-static ssize_t kill_server_store(struct class *class,
-				 struct class_attribute *attr, const char *buf,
+static ssize_t kill_server_store(const struct class *class,
+				 const struct class_attribute *attr, const char *buf,
 				 size_t len)
 {
 	if (!sysfs_streq(buf, "hard"))
@@ -455,7 +455,7 @@ static const char * const debug_type_strings[] = {"smb", "auth", "vfs",
 						  "oplock", "ipc", "conn",
 						  "rdma"};
 
-static ssize_t debug_show(struct class *class, struct class_attribute *attr,
+static ssize_t debug_show(const struct class *class, const struct class_attribute *attr,
 			  char *buf)
 {
 	ssize_t sz = 0;
@@ -473,7 +473,7 @@ static ssize_t debug_show(struct class *class, struct class_attribute *attr,
 	return sz;
 }
 
-static ssize_t debug_store(struct class *class, struct class_attribute *attr,
+static ssize_t debug_store(const struct class *class, const struct class_attribute *attr,
 			   const char *buf, size_t len)
 {
 	int i;
@@ -513,7 +513,6 @@ ATTRIBUTE_GROUPS(ksmbd_control_class);
 
 static struct class ksmbd_control_class = {
 	.name		= "ksmbd-control",
-	.owner		= THIS_MODULE,
 	.class_groups	= ksmbd_control_class_groups,
 };
 
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index 67b7e766a06b..bbc9e92935fb 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -756,19 +756,6 @@ static void build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt,
 	pneg_ctxt->Ciphers[0] = cipher_type;
 }
 
-static void build_compression_ctxt(struct smb2_compression_capabilities_context *pneg_ctxt,
-				   __le16 comp_algo)
-{
-	pneg_ctxt->ContextType = SMB2_COMPRESSION_CAPABILITIES;
-	pneg_ctxt->DataLength =
-		cpu_to_le16(sizeof(struct smb2_compression_capabilities_context)
-			- sizeof(struct smb2_neg_context));
-	pneg_ctxt->Reserved = cpu_to_le32(0);
-	pneg_ctxt->CompressionAlgorithmCount = cpu_to_le16(1);
-	pneg_ctxt->Flags = cpu_to_le32(0);
-	pneg_ctxt->CompressionAlgorithms[0] = comp_algo;
-}
-
 static void build_sign_cap_ctxt(struct smb2_signing_capabilities *pneg_ctxt,
 				__le16 sign_algo)
 {
@@ -808,7 +795,7 @@ static void assemble_neg_contexts(struct ksmbd_conn *conn,
 				  struct smb2_negotiate_rsp *rsp,
 				  void *smb2_buf_len)
 {
-	char *pneg_ctxt = (char *)rsp +
+	char * const pneg_ctxt = (char *)rsp +
 			le32_to_cpu(rsp->NegotiateContextOffset);
 	int neg_ctxt_cnt = 1;
 	int ctxt_size;
@@ -817,61 +804,46 @@ static void assemble_neg_contexts(struct ksmbd_conn *conn,
 		    "assemble SMB2_PREAUTH_INTEGRITY_CAPABILITIES context\n");
 	build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt,
 			   conn->preauth_info->Preauth_HashId);
-	rsp->NegotiateContextCount = cpu_to_le16(neg_ctxt_cnt);
 	inc_rfc1001_len(smb2_buf_len, AUTH_GSS_PADDING);
 	ctxt_size = sizeof(struct smb2_preauth_neg_context);
-	/* Round to 8 byte boundary */
-	pneg_ctxt += round_up(sizeof(struct smb2_preauth_neg_context), 8);
 
 	if (conn->cipher_type) {
+		/* Round to 8 byte boundary */
 		ctxt_size = round_up(ctxt_size, 8);
 		ksmbd_debug(SMB,
 			    "assemble SMB2_ENCRYPTION_CAPABILITIES context\n");
-		build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt,
+		build_encrypt_ctxt((struct smb2_encryption_neg_context *)
+				   (pneg_ctxt + ctxt_size),
 				   conn->cipher_type);
-		rsp->NegotiateContextCount = cpu_to_le16(++neg_ctxt_cnt);
+		neg_ctxt_cnt++;
 		ctxt_size += sizeof(struct smb2_encryption_neg_context) + 2;
-		/* Round to 8 byte boundary */
-		pneg_ctxt +=
-			round_up(sizeof(struct smb2_encryption_neg_context) + 2,
-				 8);
 	}
 
-	if (conn->compress_algorithm) {
-		ctxt_size = round_up(ctxt_size, 8);
-		ksmbd_debug(SMB,
-			    "assemble SMB2_COMPRESSION_CAPABILITIES context\n");
-		/* Temporarily set to SMB3_COMPRESS_NONE */
-		build_compression_ctxt((struct smb2_compression_capabilities_context *)pneg_ctxt,
-				       conn->compress_algorithm);
-		rsp->NegotiateContextCount = cpu_to_le16(++neg_ctxt_cnt);
-		ctxt_size += sizeof(struct smb2_compression_capabilities_context) + 2;
-		/* Round to 8 byte boundary */
-		pneg_ctxt += round_up(sizeof(struct smb2_compression_capabilities_context) + 2,
-				      8);
-	}
+	/* compression context not yet supported */
+	WARN_ON(conn->compress_algorithm != SMB3_COMPRESS_NONE);
 
 	if (conn->posix_ext_supported) {
 		ctxt_size = round_up(ctxt_size, 8);
 		ksmbd_debug(SMB,
 			    "assemble SMB2_POSIX_EXTENSIONS_AVAILABLE context\n");
-		build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt);
-		rsp->NegotiateContextCount = cpu_to_le16(++neg_ctxt_cnt);
+		build_posix_ctxt((struct smb2_posix_neg_context *)
+				 (pneg_ctxt + ctxt_size));
+		neg_ctxt_cnt++;
 		ctxt_size += sizeof(struct smb2_posix_neg_context);
-		/* Round to 8 byte boundary */
-		pneg_ctxt += round_up(sizeof(struct smb2_posix_neg_context), 8);
 	}
 
 	if (conn->signing_negotiated) {
 		ctxt_size = round_up(ctxt_size, 8);
 		ksmbd_debug(SMB,
 			    "assemble SMB2_SIGNING_CAPABILITIES context\n");
-		build_sign_cap_ctxt((struct smb2_signing_capabilities *)pneg_ctxt,
+		build_sign_cap_ctxt((struct smb2_signing_capabilities *)
+				    (pneg_ctxt + ctxt_size),
 				    conn->signing_algorithm);
-		rsp->NegotiateContextCount = cpu_to_le16(++neg_ctxt_cnt);
+		neg_ctxt_cnt++;
 		ctxt_size += sizeof(struct smb2_signing_capabilities) + 2;
 	}
 
+	rsp->NegotiateContextCount = cpu_to_le16(neg_ctxt_cnt);
 	inc_rfc1001_len(smb2_buf_len, ctxt_size);
 }
 
@@ -2436,7 +2408,7 @@ static int smb2_creat(struct ksmbd_work *work, struct path *path, char *name,
 			return rc;
 	}
 
-	rc = ksmbd_vfs_kern_path(work, name, 0, path, 0);
+	rc = ksmbd_vfs_kern_path_locked(work, name, 0, path, 0);
 	if (rc) {
 		pr_err("cannot get linux path (%s), err = %d\n",
 		       name, rc);
@@ -2727,8 +2699,10 @@ int smb2_open(struct ksmbd_work *work)
 		goto err_out1;
 	}
 
-	rc = ksmbd_vfs_kern_path(work, name, LOOKUP_NO_SYMLINKS, &path, 1);
+	rc = ksmbd_vfs_kern_path_locked(work, name, LOOKUP_NO_SYMLINKS, &path, 1);
 	if (!rc) {
+		file_present = true;
+
 		if (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE) {
 			/*
 			 * If file exists with under flags, return access
@@ -2737,7 +2711,6 @@ int smb2_open(struct ksmbd_work *work)
 			if (req->CreateDisposition == FILE_OVERWRITE_IF_LE ||
 			    req->CreateDisposition == FILE_OPEN_IF_LE) {
 				rc = -EACCES;
-				path_put(&path);
 				goto err_out;
 			}
 
@@ -2745,26 +2718,23 @@ int smb2_open(struct ksmbd_work *work)
 				ksmbd_debug(SMB,
 					    "User does not have write permission\n");
 				rc = -EACCES;
-				path_put(&path);
 				goto err_out;
 			}
 		} else if (d_is_symlink(path.dentry)) {
 			rc = -EACCES;
-			path_put(&path);
 			goto err_out;
 		}
-	}
 
-	if (rc) {
+		file_present = true;
+		idmap = mnt_idmap(path.mnt);
+	} else {
 		if (rc != -ENOENT)
 			goto err_out;
 		ksmbd_debug(SMB, "can not get linux path for %s, rc = %d\n",
 			    name, rc);
 		rc = 0;
-	} else {
-		file_present = true;
-		idmap = mnt_idmap(path.mnt);
 	}
+
 	if (stream_name) {
 		if (req->CreateOptions & FILE_DIRECTORY_FILE_LE) {
 			if (s_type == DATA_STREAM) {
@@ -2892,8 +2862,9 @@ int smb2_open(struct ksmbd_work *work)
 
 			if ((daccess & FILE_DELETE_LE) ||
 			    (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) {
-				rc = ksmbd_vfs_may_delete(idmap,
-							  path.dentry);
+				rc = inode_permission(idmap,
+						      d_inode(path.dentry->d_parent),
+						      MAY_EXEC | MAY_WRITE);
 				if (rc)
 					goto err_out;
 			}
@@ -3264,10 +3235,13 @@ int smb2_open(struct ksmbd_work *work)
 	}
 
 err_out:
-	if (file_present || created)
-		path_put(&path);
+	if (file_present || created) {
+		inode_unlock(d_inode(path.dentry->d_parent));
+		dput(path.dentry);
+	}
 	ksmbd_revert_fsids(work);
 err_out1:
+
 	if (rc) {
 		if (rc == -EINVAL)
 			rsp->hdr.Status = STATUS_INVALID_PARAMETER;
@@ -5418,44 +5392,19 @@ int smb2_echo(struct ksmbd_work *work)
 
 static int smb2_rename(struct ksmbd_work *work,
 		       struct ksmbd_file *fp,
-		       struct mnt_idmap *idmap,
 		       struct smb2_file_rename_info *file_info,
 		       struct nls_table *local_nls)
 {
 	struct ksmbd_share_config *share = fp->tcon->share_conf;
-	char *new_name = NULL, *abs_oldname = NULL, *old_name = NULL;
-	char *pathname = NULL;
-	struct path path;
-	bool file_present = true;
-	int rc;
+	char *new_name = NULL;
+	int rc, flags = 0;
 
 	ksmbd_debug(SMB, "setting FILE_RENAME_INFO\n");
-	pathname = kmalloc(PATH_MAX, GFP_KERNEL);
-	if (!pathname)
-		return -ENOMEM;
-
-	abs_oldname = file_path(fp->filp, pathname, PATH_MAX);
-	if (IS_ERR(abs_oldname)) {
-		rc = -EINVAL;
-		goto out;
-	}
-	old_name = strrchr(abs_oldname, '/');
-	if (old_name && old_name[1] != '\0') {
-		old_name++;
-	} else {
-		ksmbd_debug(SMB, "can't get last component in path %s\n",
-			    abs_oldname);
-		rc = -ENOENT;
-		goto out;
-	}
-
 	new_name = smb2_get_name(file_info->FileName,
 				 le32_to_cpu(file_info->FileNameLength),
 				 local_nls);
-	if (IS_ERR(new_name)) {
-		rc = PTR_ERR(new_name);
-		goto out;
-	}
+	if (IS_ERR(new_name))
+		return PTR_ERR(new_name);
 
 	if (strchr(new_name, ':')) {
 		int s_type;
@@ -5481,7 +5430,7 @@ static int smb2_rename(struct ksmbd_work *work,
 		if (rc)
 			goto out;
 
-		rc = ksmbd_vfs_setxattr(idmap,
+		rc = ksmbd_vfs_setxattr(file_mnt_idmap(fp->filp),
 					fp->filp->f_path.dentry,
 					xattr_stream_name,
 					NULL, 0, 0);
@@ -5496,47 +5445,18 @@ static int smb2_rename(struct ksmbd_work *work,
 	}
 
 	ksmbd_debug(SMB, "new name %s\n", new_name);
-	rc = ksmbd_vfs_kern_path(work, new_name, LOOKUP_NO_SYMLINKS, &path, 1);
-	if (rc) {
-		if (rc != -ENOENT)
-			goto out;
-		file_present = false;
-	} else {
-		path_put(&path);
-	}
-
 	if (ksmbd_share_veto_filename(share, new_name)) {
 		rc = -ENOENT;
 		ksmbd_debug(SMB, "Can't rename vetoed file: %s\n", new_name);
 		goto out;
 	}
 
-	if (file_info->ReplaceIfExists) {
-		if (file_present) {
-			rc = ksmbd_vfs_remove_file(work, new_name);
-			if (rc) {
-				if (rc != -ENOTEMPTY)
-					rc = -EINVAL;
-				ksmbd_debug(SMB, "cannot delete %s, rc %d\n",
-					    new_name, rc);
-				goto out;
-			}
-		}
-	} else {
-		if (file_present &&
-		    strncmp(old_name, path.dentry->d_name.name, strlen(old_name))) {
-			rc = -EEXIST;
-			ksmbd_debug(SMB,
-				    "cannot rename already existing file\n");
-			goto out;
-		}
-	}
+	if (!file_info->ReplaceIfExists)
+		flags = RENAME_NOREPLACE;
 
-	rc = ksmbd_vfs_fp_rename(work, fp, new_name);
+	rc = ksmbd_vfs_rename(work, &fp->filp->f_path, new_name, flags);
 out:
-	kfree(pathname);
-	if (!IS_ERR(new_name))
-		kfree(new_name);
+	kfree(new_name);
 	return rc;
 }
 
@@ -5576,18 +5496,17 @@ static int smb2_create_link(struct ksmbd_work *work,
 	}
 
 	ksmbd_debug(SMB, "target name is %s\n", target_name);
-	rc = ksmbd_vfs_kern_path(work, link_name, LOOKUP_NO_SYMLINKS, &path, 0);
+	rc = ksmbd_vfs_kern_path_locked(work, link_name, LOOKUP_NO_SYMLINKS,
+					&path, 0);
 	if (rc) {
 		if (rc != -ENOENT)
 			goto out;
 		file_present = false;
-	} else {
-		path_put(&path);
 	}
 
 	if (file_info->ReplaceIfExists) {
 		if (file_present) {
-			rc = ksmbd_vfs_remove_file(work, link_name);
+			rc = ksmbd_vfs_remove_file(work, &path);
 			if (rc) {
 				rc = -EINVAL;
 				ksmbd_debug(SMB, "cannot delete %s\n",
@@ -5607,6 +5526,10 @@ static int smb2_create_link(struct ksmbd_work *work,
 	if (rc)
 		rc = -EINVAL;
 out:
+	if (file_present) {
+		inode_unlock(d_inode(path.dentry->d_parent));
+		path_put(&path);
+	}
 	if (!IS_ERR(link_name))
 		kfree(link_name);
 	kfree(pathname);
@@ -5784,12 +5707,6 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp,
 			   struct smb2_file_rename_info *rename_info,
 			   unsigned int buf_len)
 {
-	struct mnt_idmap *idmap;
-	struct ksmbd_file *parent_fp;
-	struct dentry *parent;
-	struct dentry *dentry = fp->filp->f_path.dentry;
-	int ret;
-
 	if (!(fp->daccess & FILE_DELETE_LE)) {
 		pr_err("no right to delete : 0x%x\n", fp->daccess);
 		return -EACCES;
@@ -5799,32 +5716,10 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp,
 			le32_to_cpu(rename_info->FileNameLength))
 		return -EINVAL;
 
-	idmap = file_mnt_idmap(fp->filp);
-	if (ksmbd_stream_fd(fp))
-		goto next;
-
-	parent = dget_parent(dentry);
-	ret = ksmbd_vfs_lock_parent(idmap, parent, dentry);
-	if (ret) {
-		dput(parent);
-		return ret;
-	}
-
-	parent_fp = ksmbd_lookup_fd_inode(d_inode(parent));
-	inode_unlock(d_inode(parent));
-	dput(parent);
+	if (!le32_to_cpu(rename_info->FileNameLength))
+		return -EINVAL;
 
-	if (parent_fp) {
-		if (parent_fp->daccess & FILE_DELETE_LE) {
-			pr_err("parent dir is opened with delete access\n");
-			ksmbd_fd_put(work, parent_fp);
-			return -ESHARE;
-		}
-		ksmbd_fd_put(work, parent_fp);
-	}
-next:
-	return smb2_rename(work, fp, idmap, rename_info,
-			   work->conn->local_nls);
+	return smb2_rename(work, fp, rename_info, work->conn->local_nls);
 }
 
 static int set_file_disposition_info(struct ksmbd_file *fp,
diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h
index 9420dd2813fb..67dc552f2ef7 100644
--- a/fs/ksmbd/smb2pdu.h
+++ b/fs/ksmbd/smb2pdu.h
@@ -70,18 +70,6 @@ struct create_durable_req_v2 {
 	__u8 CreateGuid[16];
 } __packed;
 
-struct create_durable_reconn_req {
-	struct create_context ccontext;
-	__u8   Name[8];
-	union {
-		__u8  Reserved[16];
-		struct {
-			__u64 PersistentFileId;
-			__u64 VolatileFileId;
-		} Fid;
-	} Data;
-} __packed;
-
 struct create_durable_reconn_v2_req {
 	struct create_context ccontext;
 	__u8   Name[8];
@@ -109,12 +97,6 @@ struct create_app_inst_id_vers {
 	__le64 AppInstanceVersionLow;
 } __packed;
 
-struct create_mxac_req {
-	struct create_context ccontext;
-	__u8   Name[8];
-	__le64 Timestamp;
-} __packed;
-
 struct create_alloc_size_req {
 	struct create_context ccontext;
 	__u8   Name[8];
@@ -137,21 +119,6 @@ struct create_durable_v2_rsp {
 	__le32 Flags;
 } __packed;
 
-struct create_mxac_rsp {
-	struct create_context ccontext;
-	__u8   Name[8];
-	__le32 QueryStatus;
-	__le32 MaximalAccess;
-} __packed;
-
-struct create_disk_id_rsp {
-	struct create_context ccontext;
-	__u8   Name[8];
-	__le64 DiskFileId;
-	__le64 VolumeId;
-	__u8  Reserved[16];
-} __packed;
-
 /* equivalent of the contents of SMB3.1.1 POSIX open context response */
 struct create_posix_rsp {
 	struct create_context ccontext;
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index 5ea9229dad2c..778c152708e4 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -18,8 +18,7 @@
 #include <linux/vmalloc.h>
 #include <linux/sched/xacct.h>
 #include <linux/crc32c.h>
-
-#include "../internal.h"	/* for vfs_path_lookup */
+#include <linux/namei.h>
 
 #include "glob.h"
 #include "oplock.h"
@@ -37,19 +36,6 @@
 #include "mgmt/user_session.h"
 #include "mgmt/user_config.h"
 
-static char *extract_last_component(char *path)
-{
-	char *p = strrchr(path, '/');
-
-	if (p && p[1] != '\0') {
-		*p = '\0';
-		p++;
-	} else {
-		p = NULL;
-	}
-	return p;
-}
-
 static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work,
 				    struct inode *parent_inode,
 				    struct inode *inode)
@@ -63,65 +49,77 @@ static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work,
 
 /**
  * ksmbd_vfs_lock_parent() - lock parent dentry if it is stable
- *
- * the parent dentry got by dget_parent or @parent could be
- * unstable, we try to lock a parent inode and lookup the
- * child dentry again.
- *
- * the reference count of @parent isn't incremented.
  */
-int ksmbd_vfs_lock_parent(struct mnt_idmap *idmap, struct dentry *parent,
-			  struct dentry *child)
+int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child)
 {
-	struct dentry *dentry;
-	int ret = 0;
-
 	inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
-	dentry = lookup_one(idmap, child->d_name.name, parent,
-			    child->d_name.len);
-	if (IS_ERR(dentry)) {
-		ret = PTR_ERR(dentry);
-		goto out_err;
-	}
-
-	if (dentry != child) {
-		ret = -ESTALE;
-		dput(dentry);
-		goto out_err;
+	if (child->d_parent != parent) {
+		inode_unlock(d_inode(parent));
+		return -ENOENT;
 	}
 
-	dput(dentry);
 	return 0;
-out_err:
-	inode_unlock(d_inode(parent));
-	return ret;
 }
 
-int ksmbd_vfs_may_delete(struct mnt_idmap *idmap,
-			 struct dentry *dentry)
+static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf,
+					char *pathname, unsigned int flags,
+					struct path *path)
 {
-	struct dentry *parent;
-	int ret;
+	struct qstr last;
+	struct filename *filename;
+	struct path *root_share_path = &share_conf->vfs_path;
+	int err, type;
+	struct path parent_path;
+	struct dentry *d;
+
+	if (pathname[0] == '\0') {
+		pathname = share_conf->path;
+		root_share_path = NULL;
+	} else {
+		flags |= LOOKUP_BENEATH;
+	}
+
+	filename = getname_kernel(pathname);
+	if (IS_ERR(filename))
+		return PTR_ERR(filename);
+
+	err = vfs_path_parent_lookup(filename, flags,
+				     &parent_path, &last, &type,
+				     root_share_path);
+	putname(filename);
+	if (err)
+		return err;
 
-	parent = dget_parent(dentry);
-	ret = ksmbd_vfs_lock_parent(idmap, parent, dentry);
-	if (ret) {
-		dput(parent);
-		return ret;
+	if (unlikely(type != LAST_NORM)) {
+		path_put(&parent_path);
+		return -ENOENT;
 	}
 
-	ret = inode_permission(idmap, d_inode(parent),
-			       MAY_EXEC | MAY_WRITE);
+	inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT);
+	d = lookup_one_qstr_excl(&last, parent_path.dentry, 0);
+	if (IS_ERR(d))
+		goto err_out;
 
-	inode_unlock(d_inode(parent));
-	dput(parent);
-	return ret;
+	if (d_is_negative(d)) {
+		dput(d);
+		goto err_out;
+	}
+
+	path->dentry = d;
+	path->mnt = share_conf->vfs_path.mnt;
+	path_put(&parent_path);
+
+	return 0;
+
+err_out:
+	inode_unlock(parent_path.dentry->d_inode);
+	path_put(&parent_path);
+	return -ENOENT;
 }
 
 int ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap,
 				   struct dentry *dentry, __le32 *daccess)
 {
-	struct dentry *parent;
 	int ret = 0;
 
 	*daccess = cpu_to_le32(FILE_READ_ATTRIBUTES | READ_CONTROL);
@@ -138,18 +136,9 @@ int ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap,
 	if (!inode_permission(idmap, d_inode(dentry), MAY_OPEN | MAY_EXEC))
 		*daccess |= FILE_EXECUTE_LE;
 
-	parent = dget_parent(dentry);
-	ret = ksmbd_vfs_lock_parent(idmap, parent, dentry);
-	if (ret) {
-		dput(parent);
-		return ret;
-	}
-
-	if (!inode_permission(idmap, d_inode(parent), MAY_EXEC | MAY_WRITE))
+	if (!inode_permission(idmap, d_inode(dentry->d_parent), MAY_EXEC | MAY_WRITE))
 		*daccess |= FILE_DELETE_LE;
 
-	inode_unlock(d_inode(parent));
-	dput(parent);
 	return ret;
 }
 
@@ -582,54 +571,32 @@ int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id)
  *
  * Return:	0 on success, otherwise error
  */
-int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name)
+int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path)
 {
 	struct mnt_idmap *idmap;
-	struct path path;
-	struct dentry *parent;
+	struct dentry *parent = path->dentry->d_parent;
 	int err;
 
 	if (ksmbd_override_fsids(work))
 		return -ENOMEM;
 
-	err = ksmbd_vfs_kern_path(work, name, LOOKUP_NO_SYMLINKS, &path, false);
-	if (err) {
-		ksmbd_debug(VFS, "can't get %s, err %d\n", name, err);
-		ksmbd_revert_fsids(work);
-		return err;
-	}
-
-	idmap = mnt_idmap(path.mnt);
-	parent = dget_parent(path.dentry);
-	err = ksmbd_vfs_lock_parent(idmap, parent, path.dentry);
-	if (err) {
-		dput(parent);
-		path_put(&path);
-		ksmbd_revert_fsids(work);
-		return err;
-	}
-
-	if (!d_inode(path.dentry)->i_nlink) {
+	if (!d_inode(path->dentry)->i_nlink) {
 		err = -ENOENT;
 		goto out_err;
 	}
 
-	if (S_ISDIR(d_inode(path.dentry)->i_mode)) {
-		err = vfs_rmdir(idmap, d_inode(parent), path.dentry);
+	idmap = mnt_idmap(path->mnt);
+	if (S_ISDIR(d_inode(path->dentry)->i_mode)) {
+		err = vfs_rmdir(idmap, d_inode(parent), path->dentry);
 		if (err && err != -ENOTEMPTY)
-			ksmbd_debug(VFS, "%s: rmdir failed, err %d\n", name,
-				    err);
+			ksmbd_debug(VFS, "rmdir failed, err %d\n", err);
 	} else {
-		err = vfs_unlink(idmap, d_inode(parent), path.dentry, NULL);
+		err = vfs_unlink(idmap, d_inode(parent), path->dentry, NULL);
 		if (err)
-			ksmbd_debug(VFS, "%s: unlink failed, err %d\n", name,
-				    err);
+			ksmbd_debug(VFS, "unlink failed, err %d\n", err);
 	}
 
 out_err:
-	inode_unlock(d_inode(parent));
-	dput(parent);
-	path_put(&path);
 	ksmbd_revert_fsids(work);
 	return err;
 }
@@ -688,149 +655,114 @@ out1:
 	return err;
 }
 
-static int ksmbd_validate_entry_in_use(struct dentry *src_dent)
+int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path,
+		     char *newname, int flags)
 {
-	struct dentry *dst_dent;
-
-	spin_lock(&src_dent->d_lock);
-	list_for_each_entry(dst_dent, &src_dent->d_subdirs, d_child) {
-		struct ksmbd_file *child_fp;
+	struct dentry *old_parent, *new_dentry, *trap;
+	struct dentry *old_child = old_path->dentry;
+	struct path new_path;
+	struct qstr new_last;
+	struct renamedata rd;
+	struct filename *to;
+	struct ksmbd_share_config *share_conf = work->tcon->share_conf;
+	struct ksmbd_file *parent_fp;
+	int new_type;
+	int err, lookup_flags = LOOKUP_NO_SYMLINKS;
 
-		if (d_really_is_negative(dst_dent))
-			continue;
+	if (ksmbd_override_fsids(work))
+		return -ENOMEM;
 
-		child_fp = ksmbd_lookup_fd_inode(d_inode(dst_dent));
-		if (child_fp) {
-			spin_unlock(&src_dent->d_lock);
-			ksmbd_debug(VFS, "Forbid rename, sub file/dir is in use\n");
-			return -EACCES;
-		}
+	to = getname_kernel(newname);
+	if (IS_ERR(to)) {
+		err = PTR_ERR(to);
+		goto revert_fsids;
 	}
-	spin_unlock(&src_dent->d_lock);
 
-	return 0;
-}
+retry:
+	err = vfs_path_parent_lookup(to, lookup_flags | LOOKUP_BENEATH,
+				     &new_path, &new_last, &new_type,
+				     &share_conf->vfs_path);
+	if (err)
+		goto out1;
 
-static int __ksmbd_vfs_rename(struct ksmbd_work *work,
-			      struct mnt_idmap *src_idmap,
-			      struct dentry *src_dent_parent,
-			      struct dentry *src_dent,
-			      struct mnt_idmap *dst_idmap,
-			      struct dentry *dst_dent_parent,
-			      struct dentry *trap_dent,
-			      char *dst_name)
-{
-	struct dentry *dst_dent;
-	int err;
+	if (old_path->mnt != new_path.mnt) {
+		err = -EXDEV;
+		goto out2;
+	}
 
-	if (!work->tcon->posix_extensions) {
-		err = ksmbd_validate_entry_in_use(src_dent);
-		if (err)
-			return err;
+	trap = lock_rename_child(old_child, new_path.dentry);
+
+	old_parent = dget(old_child->d_parent);
+	if (d_unhashed(old_child)) {
+		err = -EINVAL;
+		goto out3;
 	}
 
-	if (d_really_is_negative(src_dent_parent))
-		return -ENOENT;
-	if (d_really_is_negative(dst_dent_parent))
-		return -ENOENT;
-	if (d_really_is_negative(src_dent))
-		return -ENOENT;
-	if (src_dent == trap_dent)
-		return -EINVAL;
+	parent_fp = ksmbd_lookup_fd_inode(d_inode(old_child->d_parent));
+	if (parent_fp) {
+		if (parent_fp->daccess & FILE_DELETE_LE) {
+			pr_err("parent dir is opened with delete access\n");
+			err = -ESHARE;
+			ksmbd_fd_put(work, parent_fp);
+			goto out3;
+		}
+		ksmbd_fd_put(work, parent_fp);
+	}
 
-	if (ksmbd_override_fsids(work))
-		return -ENOMEM;
+	new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry,
+					  lookup_flags | LOOKUP_RENAME_TARGET);
+	if (IS_ERR(new_dentry)) {
+		err = PTR_ERR(new_dentry);
+		goto out3;
+	}
 
-	dst_dent = lookup_one(dst_idmap, dst_name,
-			      dst_dent_parent, strlen(dst_name));
-	err = PTR_ERR(dst_dent);
-	if (IS_ERR(dst_dent)) {
-		pr_err("lookup failed %s [%d]\n", dst_name, err);
-		goto out;
+	if (d_is_symlink(new_dentry)) {
+		err = -EACCES;
+		goto out4;
 	}
 
-	err = -ENOTEMPTY;
-	if (dst_dent != trap_dent && !d_really_is_positive(dst_dent)) {
-		struct renamedata rd = {
-			.old_mnt_idmap	= src_idmap,
-			.old_dir	= d_inode(src_dent_parent),
-			.old_dentry	= src_dent,
-			.new_mnt_idmap	= dst_idmap,
-			.new_dir	= d_inode(dst_dent_parent),
-			.new_dentry	= dst_dent,
-		};
-		err = vfs_rename(&rd);
+	if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) {
+		err = -EEXIST;
+		goto out4;
 	}
-	if (err)
-		pr_err("vfs_rename failed err %d\n", err);
-	if (dst_dent)
-		dput(dst_dent);
-out:
-	ksmbd_revert_fsids(work);
-	return err;
-}
 
-int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
-			char *newname)
-{
-	struct mnt_idmap *idmap;
-	struct path dst_path;
-	struct dentry *src_dent_parent, *dst_dent_parent;
-	struct dentry *src_dent, *trap_dent, *src_child;
-	char *dst_name;
-	int err;
+	if (old_child == trap) {
+		err = -EINVAL;
+		goto out4;
+	}
 
-	dst_name = extract_last_component(newname);
-	if (!dst_name) {
-		dst_name = newname;
-		newname = "";
+	if (new_dentry == trap) {
+		err = -ENOTEMPTY;
+		goto out4;
 	}
 
-	src_dent_parent = dget_parent(fp->filp->f_path.dentry);
-	src_dent = fp->filp->f_path.dentry;
+	rd.old_mnt_idmap	= mnt_idmap(old_path->mnt),
+	rd.old_dir		= d_inode(old_parent),
+	rd.old_dentry		= old_child,
+	rd.new_mnt_idmap	= mnt_idmap(new_path.mnt),
+	rd.new_dir		= new_path.dentry->d_inode,
+	rd.new_dentry		= new_dentry,
+	rd.flags		= flags,
+	err = vfs_rename(&rd);
+	if (err)
+		ksmbd_debug(VFS, "vfs_rename failed err %d\n", err);
 
-	err = ksmbd_vfs_kern_path(work, newname,
-				  LOOKUP_NO_SYMLINKS | LOOKUP_DIRECTORY,
-				  &dst_path, false);
-	if (err) {
-		ksmbd_debug(VFS, "Cannot get path for %s [%d]\n", newname, err);
-		goto out;
+out4:
+	dput(new_dentry);
+out3:
+	dput(old_parent);
+	unlock_rename(old_parent, new_path.dentry);
+out2:
+	path_put(&new_path);
+
+	if (retry_estale(err, lookup_flags)) {
+		lookup_flags |= LOOKUP_REVAL;
+		goto retry;
 	}
-	dst_dent_parent = dst_path.dentry;
-
-	trap_dent = lock_rename(src_dent_parent, dst_dent_parent);
-	dget(src_dent);
-	dget(dst_dent_parent);
-	idmap = file_mnt_idmap(fp->filp);
-	src_child = lookup_one(idmap, src_dent->d_name.name, src_dent_parent,
-			       src_dent->d_name.len);
-	if (IS_ERR(src_child)) {
-		err = PTR_ERR(src_child);
-		goto out_lock;
-	}
-
-	if (src_child != src_dent) {
-		err = -ESTALE;
-		dput(src_child);
-		goto out_lock;
-	}
-	dput(src_child);
-
-	err = __ksmbd_vfs_rename(work,
-				 idmap,
-				 src_dent_parent,
-				 src_dent,
-				 mnt_idmap(dst_path.mnt),
-				 dst_dent_parent,
-				 trap_dent,
-				 dst_name);
-out_lock:
-	dput(src_dent);
-	dput(dst_dent_parent);
-	unlock_rename(src_dent_parent, dst_dent_parent);
-	path_put(&dst_path);
-out:
-	dput(src_dent_parent);
+out1:
+	putname(to);
+revert_fsids:
+	ksmbd_revert_fsids(work);
 	return err;
 }
 
@@ -1081,14 +1013,16 @@ int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap,
 	return vfs_removexattr(idmap, dentry, attr_name);
 }
 
-int ksmbd_vfs_unlink(struct mnt_idmap *idmap,
-		     struct dentry *dir, struct dentry *dentry)
+int ksmbd_vfs_unlink(struct file *filp)
 {
 	int err = 0;
+	struct dentry *dir, *dentry = filp->f_path.dentry;
+	struct mnt_idmap *idmap = file_mnt_idmap(filp);
 
-	err = ksmbd_vfs_lock_parent(idmap, dir, dentry);
+	dir = dget_parent(dentry);
+	err = ksmbd_vfs_lock_parent(dir, dentry);
 	if (err)
-		return err;
+		goto out;
 	dget(dentry);
 
 	if (S_ISDIR(d_inode(dentry)->i_mode))
@@ -1100,6 +1034,8 @@ int ksmbd_vfs_unlink(struct mnt_idmap *idmap,
 	inode_unlock(d_inode(dir));
 	if (err)
 		ksmbd_debug(VFS, "failed to delete, err %d\n", err);
+out:
+	dput(dir);
 
 	return err;
 }
@@ -1202,7 +1138,7 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name,
 }
 
 /**
- * ksmbd_vfs_kern_path() - lookup a file and get path info
+ * ksmbd_vfs_kern_path_locked() - lookup a file and get path info
  * @name:	file path that is relative to share
  * @flags:	lookup flags
  * @path:	if lookup succeed, return path info
@@ -1210,24 +1146,20 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name,
  *
  * Return:	0 on success, otherwise error
  */
-int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name,
-			unsigned int flags, struct path *path, bool caseless)
+int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
+			       unsigned int flags, struct path *path,
+			       bool caseless)
 {
 	struct ksmbd_share_config *share_conf = work->tcon->share_conf;
 	int err;
+	struct path parent_path;
 
-	flags |= LOOKUP_BENEATH;
-	err = vfs_path_lookup(share_conf->vfs_path.dentry,
-			      share_conf->vfs_path.mnt,
-			      name,
-			      flags,
-			      path);
+	err = ksmbd_vfs_path_lookup_locked(share_conf, name, flags, path);
 	if (!err)
-		return 0;
+		return err;
 
 	if (caseless) {
 		char *filepath;
-		struct path parent;
 		size_t path_len, remain_len;
 
 		filepath = kstrdup(name, GFP_KERNEL);
@@ -1237,10 +1169,10 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name,
 		path_len = strlen(filepath);
 		remain_len = path_len;
 
-		parent = share_conf->vfs_path;
-		path_get(&parent);
+		parent_path = share_conf->vfs_path;
+		path_get(&parent_path);
 
-		while (d_can_lookup(parent.dentry)) {
+		while (d_can_lookup(parent_path.dentry)) {
 			char *filename = filepath + path_len - remain_len;
 			char *next = strchrnul(filename, '/');
 			size_t filename_len = next - filename;
@@ -1249,12 +1181,11 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name,
 			if (filename_len == 0)
 				break;
 
-			err = ksmbd_vfs_lookup_in_dir(&parent, filename,
+			err = ksmbd_vfs_lookup_in_dir(&parent_path, filename,
 						      filename_len,
 						      work->conn->um);
-			path_put(&parent);
 			if (err)
-				goto out;
+				goto out2;
 
 			next[0] = '\0';
 
@@ -1262,23 +1193,31 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name,
 					      share_conf->vfs_path.mnt,
 					      filepath,
 					      flags,
-					      &parent);
+					      path);
 			if (err)
-				goto out;
-			else if (is_last) {
-				*path = parent;
-				goto out;
-			}
+				goto out2;
+			else if (is_last)
+				goto out1;
+			path_put(&parent_path);
+			parent_path = *path;
 
 			next[0] = '/';
 			remain_len -= filename_len + 1;
 		}
 
-		path_put(&parent);
 		err = -EINVAL;
-out:
+out2:
+		path_put(&parent_path);
+out1:
 		kfree(filepath);
 	}
+
+	if (!err) {
+		err = ksmbd_vfs_lock_parent(parent_path.dentry, path->dentry);
+		if (err)
+			dput(path->dentry);
+		path_put(&parent_path);
+	}
 	return err;
 }
 
diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h
index 9d676ab0cd25..a4ae89f3230d 100644
--- a/fs/ksmbd/vfs.h
+++ b/fs/ksmbd/vfs.h
@@ -71,9 +71,7 @@ struct ksmbd_kstat {
 	__le32			file_attributes;
 };
 
-int ksmbd_vfs_lock_parent(struct mnt_idmap *idmap, struct dentry *parent,
-			  struct dentry *child);
-int ksmbd_vfs_may_delete(struct mnt_idmap *idmap, struct dentry *dentry);
+int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child);
 int ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap,
 				   struct dentry *dentry, __le32 *daccess);
 int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode);
@@ -84,12 +82,12 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp,
 		    char *buf, size_t count, loff_t *pos, bool sync,
 		    ssize_t *written);
 int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id);
-int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name);
+int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path);
 int ksmbd_vfs_link(struct ksmbd_work *work,
 		   const char *oldname, const char *newname);
 int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat);
-int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
-			char *newname);
+int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path,
+		     char *newname, int flags);
 int ksmbd_vfs_truncate(struct ksmbd_work *work,
 		       struct ksmbd_file *fp, loff_t size);
 struct srv_copychunk;
@@ -116,9 +114,9 @@ int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name,
 				size_t *xattr_stream_name_size, int s_type);
 int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap,
 			   struct dentry *dentry, char *attr_name);
-int ksmbd_vfs_kern_path(struct ksmbd_work *work,
-			char *name, unsigned int flags, struct path *path,
-			bool caseless);
+int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
+			       unsigned int flags, struct path *path,
+			       bool caseless);
 struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work,
 					  const char *name,
 					  unsigned int flags,
@@ -131,8 +129,7 @@ struct file_allocated_range_buffer;
 int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
 			 struct file_allocated_range_buffer *ranges,
 			 unsigned int in_count, unsigned int *out_count);
-int ksmbd_vfs_unlink(struct mnt_idmap *idmap, struct dentry *dir,
-		     struct dentry *dentry);
+int ksmbd_vfs_unlink(struct file *filp);
 void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat);
 int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work,
 				struct mnt_idmap *idmap,
diff --git a/fs/ksmbd/vfs_cache.c b/fs/ksmbd/vfs_cache.c
index 054a7d2e0f48..2d0138e72d78 100644
--- a/fs/ksmbd/vfs_cache.c
+++ b/fs/ksmbd/vfs_cache.c
@@ -244,7 +244,6 @@ void ksmbd_release_inode_hash(void)
 
 static void __ksmbd_inode_close(struct ksmbd_file *fp)
 {
-	struct dentry *dir, *dentry;
 	struct ksmbd_inode *ci = fp->f_ci;
 	int err;
 	struct file *filp;
@@ -263,11 +262,9 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp)
 	if (atomic_dec_and_test(&ci->m_count)) {
 		write_lock(&ci->m_lock);
 		if (ci->m_flags & (S_DEL_ON_CLS | S_DEL_PENDING)) {
-			dentry = filp->f_path.dentry;
-			dir = dentry->d_parent;
 			ci->m_flags &= ~(S_DEL_ON_CLS | S_DEL_PENDING);
 			write_unlock(&ci->m_lock);
-			ksmbd_vfs_unlink(file_mnt_idmap(filp), dir, dentry);
+			ksmbd_vfs_unlink(filp);
 			write_lock(&ci->m_lock);
 		}
 		write_unlock(&ci->m_lock);
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index 6d5e83ed4476..ac9f9d84510e 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -3,10 +3,12 @@
 # Makefile for the linux lock manager stuff
 #
 
+ccflags-y += -I$(src)			# needed for trace events
+
 obj-$(CONFIG_LOCKD) += lockd.o
 
-lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \
-	        svcshare.o svcproc.o svcsubs.o mon.o xdr.o
+lockd-objs-y += clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \
+	        svcshare.o svcproc.o svcsubs.o mon.o trace.o xdr.o
 lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o
 lockd-objs-$(CONFIG_PROC_FS) += procfs.o
 lockd-objs		      := $(lockd-objs-y)
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 82b19a30e0f0..e3972aa3045a 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -14,9 +14,12 @@
 #include <linux/nfs_fs.h>
 #include <linux/sunrpc/addr.h>
 #include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/svc_xprt.h>
 #include <linux/lockd/lockd.h>
 #include <linux/kthread.h>
 
+#include "trace.h"
+
 #define NLMDBG_FACILITY		NLMDBG_CLIENT
 
 /*
@@ -29,18 +32,6 @@ static int			reclaimer(void *ptr);
  * client perspective.
  */
 
-/*
- * This is the representation of a blocked client lock.
- */
-struct nlm_wait {
-	struct list_head	b_list;		/* linked list */
-	wait_queue_head_t	b_wait;		/* where to wait on */
-	struct nlm_host *	b_host;
-	struct file_lock *	b_lock;		/* local file lock */
-	unsigned short		b_reclaim;	/* got to reclaim lock */
-	__be32			b_status;	/* grant callback status */
-};
-
 static LIST_HEAD(nlm_blocked);
 static DEFINE_SPINLOCK(nlm_blocked_lock);
 
@@ -94,41 +85,42 @@ void nlmclnt_done(struct nlm_host *host)
 }
 EXPORT_SYMBOL_GPL(nlmclnt_done);
 
+void nlmclnt_prepare_block(struct nlm_wait *block, struct nlm_host *host, struct file_lock *fl)
+{
+	block->b_host = host;
+	block->b_lock = fl;
+	init_waitqueue_head(&block->b_wait);
+	block->b_status = nlm_lck_blocked;
+}
+
 /*
  * Queue up a lock for blocking so that the GRANTED request can see it
  */
-struct nlm_wait *nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *fl)
+void nlmclnt_queue_block(struct nlm_wait *block)
 {
-	struct nlm_wait *block;
-
-	block = kmalloc(sizeof(*block), GFP_KERNEL);
-	if (block != NULL) {
-		block->b_host = host;
-		block->b_lock = fl;
-		init_waitqueue_head(&block->b_wait);
-		block->b_status = nlm_lck_blocked;
-
-		spin_lock(&nlm_blocked_lock);
-		list_add(&block->b_list, &nlm_blocked);
-		spin_unlock(&nlm_blocked_lock);
-	}
-	return block;
+	spin_lock(&nlm_blocked_lock);
+	list_add(&block->b_list, &nlm_blocked);
+	spin_unlock(&nlm_blocked_lock);
 }
 
-void nlmclnt_finish_block(struct nlm_wait *block)
+/*
+ * Dequeue the block and return its final status
+ */
+__be32 nlmclnt_dequeue_block(struct nlm_wait *block)
 {
-	if (block == NULL)
-		return;
+	__be32 status;
+
 	spin_lock(&nlm_blocked_lock);
 	list_del(&block->b_list);
+	status = block->b_status;
 	spin_unlock(&nlm_blocked_lock);
-	kfree(block);
+	return status;
 }
 
 /*
  * Block on a lock
  */
-int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
+int nlmclnt_wait(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
 {
 	long ret;
 
@@ -154,7 +146,6 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
 	/* Reset the lock status after a server reboot so we resend */
 	if (block->b_status == nlm_lck_denied_grace_period)
 		block->b_status = nlm_lck_blocked;
-	req->a_res.status = block->b_status;
 	return 0;
 }
 
@@ -198,6 +189,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
 		res = nlm_granted;
 	}
 	spin_unlock(&nlm_blocked_lock);
+	trace_nlmclnt_grant(lock, addr, svc_addr_len(addr), res);
 	return res;
 }
 
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 16b4de868cd2..fba6c7fa7474 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -20,6 +20,8 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/lockd.h>
 
+#include "trace.h"
+
 #define NLMDBG_FACILITY		NLMDBG_CLIENT
 #define NLMCLNT_GRACE_WAIT	(5*HZ)
 #define NLMCLNT_POLL_TIMEOUT	(30*HZ)
@@ -451,6 +453,9 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
 			status = nlm_stat_to_errno(req->a_res.status);
 	}
 out:
+	trace_nlmclnt_test(&req->a_args.lock,
+			   (const struct sockaddr *)&req->a_host->h_addr,
+			   req->a_host->h_addrlen, req->a_res.status);
 	nlmclnt_release_call(req);
 	return status;
 }
@@ -516,9 +521,10 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 	const struct cred *cred = nfs_file_cred(fl->fl_file);
 	struct nlm_host	*host = req->a_host;
 	struct nlm_res	*resp = &req->a_res;
-	struct nlm_wait *block = NULL;
+	struct nlm_wait block;
 	unsigned char fl_flags = fl->fl_flags;
 	unsigned char fl_type;
+	__be32 b_status;
 	int status = -ENOLCK;
 
 	if (nsm_monitor(host) < 0)
@@ -531,31 +537,41 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 	if (status < 0)
 		goto out;
 
-	block = nlmclnt_prepare_block(host, fl);
+	nlmclnt_prepare_block(&block, host, fl);
 again:
 	/*
 	 * Initialise resp->status to a valid non-zero value,
 	 * since 0 == nlm_lck_granted
 	 */
 	resp->status = nlm_lck_blocked;
-	for(;;) {
+
+	/*
+	 * A GRANTED callback can come at any time -- even before the reply
+	 * to the LOCK request arrives, so we queue the wait before
+	 * requesting the lock.
+	 */
+	nlmclnt_queue_block(&block);
+	for (;;) {
 		/* Reboot protection */
 		fl->fl_u.nfs_fl.state = host->h_state;
 		status = nlmclnt_call(cred, req, NLMPROC_LOCK);
 		if (status < 0)
 			break;
 		/* Did a reclaimer thread notify us of a server reboot? */
-		if (resp->status ==  nlm_lck_denied_grace_period)
+		if (resp->status == nlm_lck_denied_grace_period)
 			continue;
 		if (resp->status != nlm_lck_blocked)
 			break;
 		/* Wait on an NLM blocking lock */
-		status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT);
+		status = nlmclnt_wait(&block, req, NLMCLNT_POLL_TIMEOUT);
 		if (status < 0)
 			break;
-		if (resp->status != nlm_lck_blocked)
+		if (block.b_status != nlm_lck_blocked)
 			break;
 	}
+	b_status = nlmclnt_dequeue_block(&block);
+	if (resp->status == nlm_lck_blocked)
+		resp->status = b_status;
 
 	/* if we were interrupted while blocking, then cancel the lock request
 	 * and exit
@@ -564,7 +580,7 @@ again:
 		if (!req->a_args.block)
 			goto out_unlock;
 		if (nlmclnt_cancel(host, req->a_args.block, fl) == 0)
-			goto out_unblock;
+			goto out;
 	}
 
 	if (resp->status == nlm_granted) {
@@ -593,16 +609,19 @@ again:
 		status = -ENOLCK;
 	else
 		status = nlm_stat_to_errno(resp->status);
-out_unblock:
-	nlmclnt_finish_block(block);
 out:
+	trace_nlmclnt_lock(&req->a_args.lock,
+			   (const struct sockaddr *)&req->a_host->h_addr,
+			   req->a_host->h_addrlen, req->a_res.status);
 	nlmclnt_release_call(req);
 	return status;
 out_unlock:
 	/* Fatal error: ensure that we remove the lock altogether */
+	trace_nlmclnt_lock(&req->a_args.lock,
+			   (const struct sockaddr *)&req->a_host->h_addr,
+			   req->a_host->h_addrlen, req->a_res.status);
 	dprintk("lockd: lock attempt ended in fatal error.\n"
 		"       Attempting to unlock.\n");
-	nlmclnt_finish_block(block);
 	fl_type = fl->fl_type;
 	fl->fl_type = F_UNLCK;
 	down_read(&host->h_rwsem);
@@ -696,6 +715,9 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 	/* What to do now? I'm out of my depth... */
 	status = -ENOLCK;
 out:
+	trace_nlmclnt_unlock(&req->a_args.lock,
+			     (const struct sockaddr *)&req->a_host->h_addr,
+			     req->a_host->h_addrlen, req->a_res.status);
 	nlmclnt_release_call(req);
 	return status;
 }
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index cdc8e12cdac4..127a728fcbc8 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -629,6 +629,7 @@ nlm_shutdown_hosts_net(struct net *net)
 			rpc_shutdown_client(host->h_rpcclnt);
 			host->h_rpcclnt = NULL;
 		}
+		nlmsvc_free_host_resources(host);
 	}
 
 	/* Then, perform a garbage collection pass */
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 9a47303b2cba..bb94949bc223 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -510,24 +510,6 @@ static struct ctl_table nlm_sysctls[] = {
 	{ }
 };
 
-static struct ctl_table nlm_sysctl_dir[] = {
-	{
-		.procname	= "nfs",
-		.mode		= 0555,
-		.child		= nlm_sysctls,
-	},
-	{ }
-};
-
-static struct ctl_table nlm_sysctl_root[] = {
-	{
-		.procname	= "fs",
-		.mode		= 0555,
-		.child		= nlm_sysctl_dir,
-	},
-	{ }
-};
-
 #endif	/* CONFIG_SYSCTL */
 
 /*
@@ -644,7 +626,7 @@ static int __init init_nlm(void)
 
 #ifdef CONFIG_SYSCTL
 	err = -ENOMEM;
-	nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root);
+	nlm_sysctl_table = register_sysctl("fs/nfs", nlm_sysctls);
 	if (nlm_sysctl_table == NULL)
 		goto err_sysctl;
 #endif
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 4e30f3c50970..c43ccdf28ed9 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -954,19 +954,32 @@ void
 nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status)
 {
 	struct nlm_block	*block;
+	struct file_lock	*fl;
+	int			error;
 
 	dprintk("grant_reply: looking for cookie %x, s=%d \n",
 		*(unsigned int *)(cookie->data), status);
 	if (!(block = nlmsvc_find_block(cookie)))
 		return;
 
-	if (status == nlm_lck_denied_grace_period) {
+	switch (status) {
+	case nlm_lck_denied_grace_period:
 		/* Try again in a couple of seconds */
 		nlmsvc_insert_block(block, 10 * HZ);
-	} else {
+		break;
+	case nlm_lck_denied:
+		/* Client doesn't want it, just unlock it */
+		nlmsvc_unlink_block(block);
+		fl = &block->b_call->a_args.lock.fl;
+		fl->fl_type = F_UNLCK;
+		error = vfs_lock_file(fl->fl_file, F_SETLK, fl, NULL);
+		if (error)
+			pr_warn("lockd: unable to unlock lock rejected by client!\n");
+		break;
+	default:
 		/*
-		 * Lock is now held by client, or has been rejected.
-		 * In both cases, the block should be removed.
+		 * Either it was accepted or the status makes no sense
+		 * just unlink it either way.
 		 */
 		nlmsvc_unlink_block(block);
 	}
diff --git a/fs/lockd/trace.c b/fs/lockd/trace.c
new file mode 100644
index 000000000000..d9a6ff6e673c
--- /dev/null
+++ b/fs/lockd/trace.c
@@ -0,0 +1,3 @@
+// SPDX-License-Identifier: GPL-2.0
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/fs/lockd/trace.h b/fs/lockd/trace.h
new file mode 100644
index 000000000000..7461b13b6e74
--- /dev/null
+++ b/fs/lockd/trace.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM lockd
+
+#if !defined(_TRACE_LOCKD_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_LOCKD_H
+
+#include <linux/tracepoint.h>
+#include <linux/crc32.h>
+#include <linux/nfs.h>
+#include <linux/lockd/lockd.h>
+
+#ifdef CONFIG_LOCKD_V4
+#define NLM_STATUS_LIST					\
+	nlm_status_code(LCK_GRANTED)			\
+	nlm_status_code(LCK_DENIED)			\
+	nlm_status_code(LCK_DENIED_NOLOCKS)		\
+	nlm_status_code(LCK_BLOCKED)			\
+	nlm_status_code(LCK_DENIED_GRACE_PERIOD)	\
+	nlm_status_code(DEADLCK)			\
+	nlm_status_code(ROFS)				\
+	nlm_status_code(STALE_FH)			\
+	nlm_status_code(FBIG)				\
+	nlm_status_code_end(FAILED)
+#else
+#define NLM_STATUS_LIST					\
+	nlm_status_code(LCK_GRANTED)			\
+	nlm_status_code(LCK_DENIED)			\
+	nlm_status_code(LCK_DENIED_NOLOCKS)		\
+	nlm_status_code(LCK_BLOCKED)			\
+	nlm_status_code_end(LCK_DENIED_GRACE_PERIOD)
+#endif
+
+#undef nlm_status_code
+#undef nlm_status_code_end
+#define nlm_status_code(x)	TRACE_DEFINE_ENUM(NLM_##x);
+#define nlm_status_code_end(x)	TRACE_DEFINE_ENUM(NLM_##x);
+
+NLM_STATUS_LIST
+
+#undef nlm_status_code
+#undef nlm_status_code_end
+#define nlm_status_code(x)	{ NLM_##x, #x },
+#define nlm_status_code_end(x)	{ NLM_##x, #x }
+
+#define show_nlm_status(x)	__print_symbolic(x, NLM_STATUS_LIST)
+
+DECLARE_EVENT_CLASS(nlmclnt_lock_event,
+		TP_PROTO(
+			const struct nlm_lock *lock,
+			const struct sockaddr *addr,
+			unsigned int addrlen,
+			__be32 status
+		),
+
+		TP_ARGS(lock, addr, addrlen, status),
+
+		TP_STRUCT__entry(
+			__field(u32, oh)
+			__field(u32, svid)
+			__field(u32, fh)
+			__field(unsigned long, status)
+			__field(u64, start)
+			__field(u64, len)
+			__sockaddr(addr, addrlen)
+		),
+
+		TP_fast_assign(
+			__entry->oh = ~crc32_le(0xffffffff, lock->oh.data, lock->oh.len);
+			__entry->svid = lock->svid;
+			__entry->fh = nfs_fhandle_hash(&lock->fh);
+			__entry->start = lock->lock_start;
+			__entry->len = lock->lock_len;
+			__entry->status = be32_to_cpu(status);
+			__assign_sockaddr(addr, addr, addrlen);
+		),
+
+		TP_printk(
+			"addr=%pISpc oh=0x%08x svid=0x%08x fh=0x%08x start=%llu len=%llu status=%s",
+			__get_sockaddr(addr), __entry->oh, __entry->svid,
+			__entry->fh, __entry->start, __entry->len,
+			show_nlm_status(__entry->status)
+		)
+);
+
+#define DEFINE_NLMCLNT_EVENT(name)				\
+	DEFINE_EVENT(nlmclnt_lock_event, name,			\
+			TP_PROTO(				\
+				const struct nlm_lock *lock,	\
+				const struct sockaddr *addr,	\
+				unsigned int addrlen,		\
+				__be32	status			\
+			),					\
+			TP_ARGS(lock, addr, addrlen, status))
+
+DEFINE_NLMCLNT_EVENT(nlmclnt_test);
+DEFINE_NLMCLNT_EVENT(nlmclnt_lock);
+DEFINE_NLMCLNT_EVENT(nlmclnt_unlock);
+DEFINE_NLMCLNT_EVENT(nlmclnt_grant);
+
+#endif /* _TRACE_LOCKD_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/fs/mpage.c b/fs/mpage.c
index 22b9de5ddd68..242e213ee064 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -43,23 +43,49 @@
  * status of that page is hard.  See end_buffer_async_read() for the details.
  * There is no point in duplicating all that complexity.
  */
-static void mpage_end_io(struct bio *bio)
+static void mpage_read_end_io(struct bio *bio)
 {
-	struct bio_vec *bv;
-	struct bvec_iter_all iter_all;
+	struct folio_iter fi;
+	int err = blk_status_to_errno(bio->bi_status);
+
+	bio_for_each_folio_all(fi, bio) {
+		if (err)
+			folio_set_error(fi.folio);
+		else
+			folio_mark_uptodate(fi.folio);
+		folio_unlock(fi.folio);
+	}
+
+	bio_put(bio);
+}
 
-	bio_for_each_segment_all(bv, bio, iter_all) {
-		struct page *page = bv->bv_page;
-		page_endio(page, bio_op(bio),
-			   blk_status_to_errno(bio->bi_status));
+static void mpage_write_end_io(struct bio *bio)
+{
+	struct folio_iter fi;
+	int err = blk_status_to_errno(bio->bi_status);
+
+	bio_for_each_folio_all(fi, bio) {
+		if (err) {
+			folio_set_error(fi.folio);
+			mapping_set_error(fi.folio->mapping, err);
+		}
+		folio_end_writeback(fi.folio);
 	}
 
 	bio_put(bio);
 }
 
-static struct bio *mpage_bio_submit(struct bio *bio)
+static struct bio *mpage_bio_submit_read(struct bio *bio)
+{
+	bio->bi_end_io = mpage_read_end_io;
+	guard_bio_eod(bio);
+	submit_bio(bio);
+	return NULL;
+}
+
+static struct bio *mpage_bio_submit_write(struct bio *bio)
 {
-	bio->bi_end_io = mpage_end_io;
+	bio->bi_end_io = mpage_write_end_io;
 	guard_bio_eod(bio);
 	submit_bio(bio);
 	return NULL;
@@ -265,7 +291,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
 	 * This folio will go to BIO.  Do we need to send this BIO off first?
 	 */
 	if (args->bio && (args->last_block_in_bio != blocks[0] - 1))
-		args->bio = mpage_bio_submit(args->bio);
+		args->bio = mpage_bio_submit_read(args->bio);
 
 alloc_new:
 	if (args->bio == NULL) {
@@ -278,7 +304,7 @@ alloc_new:
 
 	length = first_hole << blkbits;
 	if (!bio_add_folio(args->bio, folio, length, 0)) {
-		args->bio = mpage_bio_submit(args->bio);
+		args->bio = mpage_bio_submit_read(args->bio);
 		goto alloc_new;
 	}
 
@@ -286,7 +312,7 @@ alloc_new:
 	nblocks = map_bh->b_size >> blkbits;
 	if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
 	    (first_hole != blocks_per_page))
-		args->bio = mpage_bio_submit(args->bio);
+		args->bio = mpage_bio_submit_read(args->bio);
 	else
 		args->last_block_in_bio = blocks[blocks_per_page - 1];
 out:
@@ -294,7 +320,7 @@ out:
 
 confused:
 	if (args->bio)
-		args->bio = mpage_bio_submit(args->bio);
+		args->bio = mpage_bio_submit_read(args->bio);
 	if (!folio_test_uptodate(folio))
 		block_read_full_folio(folio, args->get_block);
 	else
@@ -356,7 +382,7 @@ void mpage_readahead(struct readahead_control *rac, get_block_t get_block)
 		args.bio = do_mpage_readpage(&args);
 	}
 	if (args.bio)
-		mpage_bio_submit(args.bio);
+		mpage_bio_submit_read(args.bio);
 }
 EXPORT_SYMBOL(mpage_readahead);
 
@@ -373,7 +399,7 @@ int mpage_read_folio(struct folio *folio, get_block_t get_block)
 
 	args.bio = do_mpage_readpage(&args);
 	if (args.bio)
-		mpage_bio_submit(args.bio);
+		mpage_bio_submit_read(args.bio);
 	return 0;
 }
 EXPORT_SYMBOL(mpage_read_folio);
@@ -577,7 +603,7 @@ page_is_mapped:
 	 * This page will go to BIO.  Do we need to send this BIO off first?
 	 */
 	if (bio && mpd->last_block_in_bio != blocks[0] - 1)
-		bio = mpage_bio_submit(bio);
+		bio = mpage_bio_submit_write(bio);
 
 alloc_new:
 	if (bio == NULL) {
@@ -596,7 +622,7 @@ alloc_new:
 	wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio));
 	length = first_unmapped << blkbits;
 	if (!bio_add_folio(bio, folio, length, 0)) {
-		bio = mpage_bio_submit(bio);
+		bio = mpage_bio_submit_write(bio);
 		goto alloc_new;
 	}
 
@@ -606,7 +632,7 @@ alloc_new:
 	folio_start_writeback(folio);
 	folio_unlock(folio);
 	if (boundary || (first_unmapped != blocks_per_page)) {
-		bio = mpage_bio_submit(bio);
+		bio = mpage_bio_submit_write(bio);
 		if (boundary_block) {
 			write_boundary_block(boundary_bdev,
 					boundary_block, 1 << blkbits);
@@ -618,7 +644,7 @@ alloc_new:
 
 confused:
 	if (bio)
-		bio = mpage_bio_submit(bio);
+		bio = mpage_bio_submit_write(bio);
 
 	/*
 	 * The caller has a ref on the inode, so *mapping is stable
@@ -652,7 +678,7 @@ mpage_writepages(struct address_space *mapping,
 	blk_start_plug(&plug);
 	ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
 	if (mpd.bio)
-		mpage_bio_submit(mpd.bio);
+		mpage_bio_submit_write(mpd.bio);
 	blk_finish_plug(&plug);
 	return ret;
 }
diff --git a/fs/namei.c b/fs/namei.c
index f04f7be58933..e4fe0879ae55 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -254,6 +254,7 @@ getname_kernel(const char * filename)
 
 	return result;
 }
+EXPORT_SYMBOL(getname_kernel);
 
 void putname(struct filename *name)
 {
@@ -271,6 +272,7 @@ void putname(struct filename *name)
 	} else
 		__putname(name);
 }
+EXPORT_SYMBOL(putname);
 
 /**
  * check_acl - perform ACL permission checking
@@ -1581,8 +1583,9 @@ static struct dentry *lookup_dcache(const struct qstr *name,
  * when directory is guaranteed to have no in-lookup children
  * at all.
  */
-static struct dentry *__lookup_hash(const struct qstr *name,
-		struct dentry *base, unsigned int flags)
+struct dentry *lookup_one_qstr_excl(const struct qstr *name,
+				    struct dentry *base,
+				    unsigned int flags)
 {
 	struct dentry *dentry = lookup_dcache(name, base, flags);
 	struct dentry *old;
@@ -1606,6 +1609,7 @@ static struct dentry *__lookup_hash(const struct qstr *name,
 	}
 	return dentry;
 }
+EXPORT_SYMBOL(lookup_one_qstr_excl);
 
 static struct dentry *lookup_fast(struct nameidata *nd)
 {
@@ -2532,16 +2536,17 @@ static int path_parentat(struct nameidata *nd, unsigned flags,
 }
 
 /* Note: this does not consume "name" */
-static int filename_parentat(int dfd, struct filename *name,
-			     unsigned int flags, struct path *parent,
-			     struct qstr *last, int *type)
+static int __filename_parentat(int dfd, struct filename *name,
+			       unsigned int flags, struct path *parent,
+			       struct qstr *last, int *type,
+			       const struct path *root)
 {
 	int retval;
 	struct nameidata nd;
 
 	if (IS_ERR(name))
 		return PTR_ERR(name);
-	set_nameidata(&nd, dfd, name, NULL);
+	set_nameidata(&nd, dfd, name, root);
 	retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
 	if (unlikely(retval == -ECHILD))
 		retval = path_parentat(&nd, flags, parent);
@@ -2556,6 +2561,13 @@ static int filename_parentat(int dfd, struct filename *name,
 	return retval;
 }
 
+static int filename_parentat(int dfd, struct filename *name,
+			     unsigned int flags, struct path *parent,
+			     struct qstr *last, int *type)
+{
+	return __filename_parentat(dfd, name, flags, parent, last, type, NULL);
+}
+
 /* does lookup, returns the object with parent locked */
 static struct dentry *__kern_path_locked(struct filename *name, struct path *path)
 {
@@ -2571,7 +2583,7 @@ static struct dentry *__kern_path_locked(struct filename *name, struct path *pat
 		return ERR_PTR(-EINVAL);
 	}
 	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
-	d = __lookup_hash(&last, path->dentry, 0);
+	d = lookup_one_qstr_excl(&last, path->dentry, 0);
 	if (IS_ERR(d)) {
 		inode_unlock(path->dentry->d_inode);
 		path_put(path);
@@ -2600,6 +2612,24 @@ int kern_path(const char *name, unsigned int flags, struct path *path)
 EXPORT_SYMBOL(kern_path);
 
 /**
+ * vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair
+ * @filename: filename structure
+ * @flags: lookup flags
+ * @parent: pointer to struct path to fill
+ * @last: last component
+ * @type: type of the last component
+ * @root: pointer to struct path of the base directory
+ */
+int vfs_path_parent_lookup(struct filename *filename, unsigned int flags,
+			   struct path *parent, struct qstr *last, int *type,
+			   const struct path *root)
+{
+	return  __filename_parentat(AT_FDCWD, filename, flags, parent, last,
+				    type, root);
+}
+EXPORT_SYMBOL(vfs_path_parent_lookup);
+
+/**
  * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
  * @dentry:  pointer to dentry of the base directory
  * @mnt: pointer to vfs mount of the base directory
@@ -2980,20 +3010,10 @@ static inline int may_create(struct mnt_idmap *idmap,
 	return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
 }
 
-/*
- * p1 and p2 should be directories on the same fs.
- */
-struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
+static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2)
 {
 	struct dentry *p;
 
-	if (p1 == p2) {
-		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
-		return NULL;
-	}
-
-	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
-
 	p = d_ancestor(p2, p1);
 	if (p) {
 		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
@@ -3012,8 +3032,64 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
 	inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
 	return NULL;
 }
+
+/*
+ * p1 and p2 should be directories on the same fs.
+ */
+struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
+{
+	if (p1 == p2) {
+		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+		return NULL;
+	}
+
+	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
+	return lock_two_directories(p1, p2);
+}
 EXPORT_SYMBOL(lock_rename);
 
+/*
+ * c1 and p2 should be on the same fs.
+ */
+struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2)
+{
+	if (READ_ONCE(c1->d_parent) == p2) {
+		/*
+		 * hopefully won't need to touch ->s_vfs_rename_mutex at all.
+		 */
+		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
+		/*
+		 * now that p2 is locked, nobody can move in or out of it,
+		 * so the test below is safe.
+		 */
+		if (likely(c1->d_parent == p2))
+			return NULL;
+
+		/*
+		 * c1 got moved out of p2 while we'd been taking locks;
+		 * unlock and fall back to slow case.
+		 */
+		inode_unlock(p2->d_inode);
+	}
+
+	mutex_lock(&c1->d_sb->s_vfs_rename_mutex);
+	/*
+	 * nobody can move out of any directories on this fs.
+	 */
+	if (likely(c1->d_parent != p2))
+		return lock_two_directories(c1->d_parent, p2);
+
+	/*
+	 * c1 got moved into p2 while we were taking locks;
+	 * we need p2 locked and ->s_vfs_rename_mutex unlocked,
+	 * for consistency with lock_rename().
+	 */
+	inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
+	mutex_unlock(&c1->d_sb->s_vfs_rename_mutex);
+	return NULL;
+}
+EXPORT_SYMBOL(lock_rename_child);
+
 void unlock_rename(struct dentry *p1, struct dentry *p2)
 {
 	inode_unlock(p1->d_inode);
@@ -3806,7 +3882,8 @@ static struct dentry *filename_create(int dfd, struct filename *name,
 	if (last.name[last.len] && !want_dir)
 		create_flags = 0;
 	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
-	dentry = __lookup_hash(&last, path->dentry, reval_flag | create_flags);
+	dentry = lookup_one_qstr_excl(&last, path->dentry,
+				      reval_flag | create_flags);
 	if (IS_ERR(dentry))
 		goto unlock;
 
@@ -4166,7 +4243,7 @@ retry:
 		goto exit2;
 
 	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
-	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
+	dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
 	error = PTR_ERR(dentry);
 	if (IS_ERR(dentry))
 		goto exit3;
@@ -4299,7 +4376,7 @@ retry:
 		goto exit2;
 retry_deleg:
 	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
-	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
+	dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
 	error = PTR_ERR(dentry);
 	if (!IS_ERR(dentry)) {
 
@@ -4863,7 +4940,8 @@ retry:
 retry_deleg:
 	trap = lock_rename(new_path.dentry, old_path.dentry);
 
-	old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
+	old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry,
+					  lookup_flags);
 	error = PTR_ERR(old_dentry);
 	if (IS_ERR(old_dentry))
 		goto exit3;
@@ -4871,7 +4949,8 @@ retry_deleg:
 	error = -ENOENT;
 	if (d_is_negative(old_dentry))
 		goto exit4;
-	new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags);
+	new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry,
+					  lookup_flags | target_flags);
 	error = PTR_ERR(new_dentry);
 	if (IS_ERR(new_dentry))
 		goto exit4;
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index e3d754a9e1b0..3404707ddbe7 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -349,8 +349,8 @@ int netfs_write_begin(struct netfs_inode *ctx,
 retry:
 	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
 				    mapping_gfp_mask(mapping));
-	if (!folio)
-		return -ENOMEM;
+	if (IS_ERR(folio))
+		return PTR_ERR(folio);
 
 	if (ctx->ops->check_write_begin) {
 		/* Allow the netfs (eg. ceph) to flush conflicts. */
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index c1c7ed2fd860..b6fc169be1b1 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -170,6 +170,7 @@ config ROOT_NFS
 config NFS_FSCACHE
 	bool "Provide NFS client caching support"
 	depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
+	select NETFS_SUPPORT
 	help
 	  Say Y here if you want NFS data to be cached locally on disc through
 	  the general filesystem cache manager
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 6fbcbb8d6587..8257de6dba45 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -55,7 +55,7 @@ static int nfs_closedir(struct inode *, struct file *);
 static int nfs_readdir(struct file *, struct dir_context *);
 static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
 static loff_t nfs_llseek_dir(struct file *, loff_t, int);
-static void nfs_readdir_free_folio(struct folio *);
+static void nfs_readdir_clear_array(struct folio *);
 
 const struct file_operations nfs_dir_operations = {
 	.llseek		= nfs_llseek_dir,
@@ -67,7 +67,7 @@ const struct file_operations nfs_dir_operations = {
 };
 
 const struct address_space_operations nfs_dir_aops = {
-	.free_folio = nfs_readdir_free_folio,
+	.free_folio = nfs_readdir_clear_array,
 };
 
 #define NFS_INIT_DTSIZE PAGE_SIZE
@@ -146,18 +146,18 @@ struct nfs_cache_array {
 	u64 change_attr;
 	u64 last_cookie;
 	unsigned int size;
-	unsigned char page_full : 1,
-		      page_is_eof : 1,
+	unsigned char folio_full : 1,
+		      folio_is_eof : 1,
 		      cookies_are_ordered : 1;
 	struct nfs_cache_array_entry array[];
 };
 
 struct nfs_readdir_descriptor {
 	struct file	*file;
-	struct page	*page;
+	struct folio	*folio;
 	struct dir_context *ctx;
-	pgoff_t		page_index;
-	pgoff_t		page_index_max;
+	pgoff_t		folio_index;
+	pgoff_t		folio_index_max;
 	u64		dir_cookie;
 	u64		last_cookie;
 	loff_t		current_index;
@@ -198,17 +198,17 @@ static void nfs_grow_dtsize(struct nfs_readdir_descriptor *desc)
 	nfs_set_dtsize(desc, desc->dtsize << 1);
 }
 
-static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie,
-					u64 change_attr)
+static void nfs_readdir_folio_init_array(struct folio *folio, u64 last_cookie,
+					 u64 change_attr)
 {
 	struct nfs_cache_array *array;
 
-	array = kmap_local_page(page);
+	array = kmap_local_folio(folio, 0);
 	array->change_attr = change_attr;
 	array->last_cookie = last_cookie;
 	array->size = 0;
-	array->page_full = 0;
-	array->page_is_eof = 0;
+	array->folio_full = 0;
+	array->folio_is_eof = 0;
 	array->cookies_are_ordered = 1;
 	kunmap_local(array);
 }
@@ -216,44 +216,39 @@ static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie,
 /*
  * we are freeing strings created by nfs_add_to_readdir_array()
  */
-static void nfs_readdir_clear_array(struct page *page)
+static void nfs_readdir_clear_array(struct folio *folio)
 {
 	struct nfs_cache_array *array;
 	unsigned int i;
 
-	array = kmap_local_page(page);
+	array = kmap_local_folio(folio, 0);
 	for (i = 0; i < array->size; i++)
 		kfree(array->array[i].name);
 	array->size = 0;
 	kunmap_local(array);
 }
 
-static void nfs_readdir_free_folio(struct folio *folio)
+static void nfs_readdir_folio_reinit_array(struct folio *folio, u64 last_cookie,
+					   u64 change_attr)
 {
-	nfs_readdir_clear_array(&folio->page);
+	nfs_readdir_clear_array(folio);
+	nfs_readdir_folio_init_array(folio, last_cookie, change_attr);
 }
 
-static void nfs_readdir_page_reinit_array(struct page *page, u64 last_cookie,
-					  u64 change_attr)
+static struct folio *
+nfs_readdir_folio_array_alloc(u64 last_cookie, gfp_t gfp_flags)
 {
-	nfs_readdir_clear_array(page);
-	nfs_readdir_page_init_array(page, last_cookie, change_attr);
+	struct folio *folio = folio_alloc(gfp_flags, 0);
+	if (folio)
+		nfs_readdir_folio_init_array(folio, last_cookie, 0);
+	return folio;
 }
 
-static struct page *
-nfs_readdir_page_array_alloc(u64 last_cookie, gfp_t gfp_flags)
+static void nfs_readdir_folio_array_free(struct folio *folio)
 {
-	struct page *page = alloc_page(gfp_flags);
-	if (page)
-		nfs_readdir_page_init_array(page, last_cookie, 0);
-	return page;
-}
-
-static void nfs_readdir_page_array_free(struct page *page)
-{
-	if (page) {
-		nfs_readdir_clear_array(page);
-		put_page(page);
+	if (folio) {
+		nfs_readdir_clear_array(folio);
+		folio_put(folio);
 	}
 }
 
@@ -264,13 +259,13 @@ static u64 nfs_readdir_array_index_cookie(struct nfs_cache_array *array)
 
 static void nfs_readdir_array_set_eof(struct nfs_cache_array *array)
 {
-	array->page_is_eof = 1;
-	array->page_full = 1;
+	array->folio_is_eof = 1;
+	array->folio_full = 1;
 }
 
 static bool nfs_readdir_array_is_full(struct nfs_cache_array *array)
 {
-	return array->page_full;
+	return array->folio_full;
 }
 
 /*
@@ -302,18 +297,18 @@ static size_t nfs_readdir_array_maxentries(void)
  */
 static int nfs_readdir_array_can_expand(struct nfs_cache_array *array)
 {
-	if (array->page_full)
+	if (array->folio_full)
 		return -ENOSPC;
 	if (array->size == nfs_readdir_array_maxentries()) {
-		array->page_full = 1;
+		array->folio_full = 1;
 		return -ENOSPC;
 	}
 	return 0;
 }
 
-static int nfs_readdir_page_array_append(struct page *page,
-					 const struct nfs_entry *entry,
-					 u64 *cookie)
+static int nfs_readdir_folio_array_append(struct folio *folio,
+					  const struct nfs_entry *entry,
+					  u64 *cookie)
 {
 	struct nfs_cache_array *array;
 	struct nfs_cache_array_entry *cache_entry;
@@ -322,7 +317,7 @@ static int nfs_readdir_page_array_append(struct page *page,
 
 	name = nfs_readdir_copy_name(entry->name, entry->len);
 
-	array = kmap_atomic(page);
+	array = kmap_atomic(folio_page(folio, 0));
 	if (!name)
 		goto out;
 	ret = nfs_readdir_array_can_expand(array);
@@ -361,17 +356,17 @@ out:
  * 127 readdir entries for a typical 64-bit system, that works out to a
  * cache of ~ 33 million entries per directory.
  */
-static pgoff_t nfs_readdir_page_cookie_hash(u64 cookie)
+static pgoff_t nfs_readdir_folio_cookie_hash(u64 cookie)
 {
 	if (cookie == 0)
 		return 0;
 	return hash_64(cookie, 18);
 }
 
-static bool nfs_readdir_page_validate(struct page *page, u64 last_cookie,
-				      u64 change_attr)
+static bool nfs_readdir_folio_validate(struct folio *folio, u64 last_cookie,
+				       u64 change_attr)
 {
-	struct nfs_cache_array *array = kmap_local_page(page);
+	struct nfs_cache_array *array = kmap_local_folio(folio, 0);
 	int ret = true;
 
 	if (array->change_attr != change_attr)
@@ -382,81 +377,83 @@ static bool nfs_readdir_page_validate(struct page *page, u64 last_cookie,
 	return ret;
 }
 
-static void nfs_readdir_page_unlock_and_put(struct page *page)
+static void nfs_readdir_folio_unlock_and_put(struct folio *folio)
 {
-	unlock_page(page);
-	put_page(page);
+	folio_unlock(folio);
+	folio_put(folio);
 }
 
-static void nfs_readdir_page_init_and_validate(struct page *page, u64 cookie,
-					       u64 change_attr)
+static void nfs_readdir_folio_init_and_validate(struct folio *folio, u64 cookie,
+						u64 change_attr)
 {
-	if (PageUptodate(page)) {
-		if (nfs_readdir_page_validate(page, cookie, change_attr))
+	if (folio_test_uptodate(folio)) {
+		if (nfs_readdir_folio_validate(folio, cookie, change_attr))
 			return;
-		nfs_readdir_clear_array(page);
+		nfs_readdir_clear_array(folio);
 	}
-	nfs_readdir_page_init_array(page, cookie, change_attr);
-	SetPageUptodate(page);
+	nfs_readdir_folio_init_array(folio, cookie, change_attr);
+	folio_mark_uptodate(folio);
 }
 
-static struct page *nfs_readdir_page_get_locked(struct address_space *mapping,
-						u64 cookie, u64 change_attr)
+static struct folio *nfs_readdir_folio_get_locked(struct address_space *mapping,
+						  u64 cookie, u64 change_attr)
 {
-	pgoff_t index = nfs_readdir_page_cookie_hash(cookie);
-	struct page *page;
+	pgoff_t index = nfs_readdir_folio_cookie_hash(cookie);
+	struct folio *folio;
 
-	page = grab_cache_page(mapping, index);
-	if (!page)
+	folio = filemap_grab_folio(mapping, index);
+	if (!folio)
 		return NULL;
-	nfs_readdir_page_init_and_validate(page, cookie, change_attr);
-	return page;
+	nfs_readdir_folio_init_and_validate(folio, cookie, change_attr);
+	return folio;
 }
 
-static u64 nfs_readdir_page_last_cookie(struct page *page)
+static u64 nfs_readdir_folio_last_cookie(struct folio *folio)
 {
 	struct nfs_cache_array *array;
 	u64 ret;
 
-	array = kmap_local_page(page);
+	array = kmap_local_folio(folio, 0);
 	ret = array->last_cookie;
 	kunmap_local(array);
 	return ret;
 }
 
-static bool nfs_readdir_page_needs_filling(struct page *page)
+static bool nfs_readdir_folio_needs_filling(struct folio *folio)
 {
 	struct nfs_cache_array *array;
 	bool ret;
 
-	array = kmap_local_page(page);
+	array = kmap_local_folio(folio, 0);
 	ret = !nfs_readdir_array_is_full(array);
 	kunmap_local(array);
 	return ret;
 }
 
-static void nfs_readdir_page_set_eof(struct page *page)
+static void nfs_readdir_folio_set_eof(struct folio *folio)
 {
 	struct nfs_cache_array *array;
 
-	array = kmap_local_page(page);
+	array = kmap_local_folio(folio, 0);
 	nfs_readdir_array_set_eof(array);
 	kunmap_local(array);
 }
 
-static struct page *nfs_readdir_page_get_next(struct address_space *mapping,
-					      u64 cookie, u64 change_attr)
+static struct folio *nfs_readdir_folio_get_next(struct address_space *mapping,
+						u64 cookie, u64 change_attr)
 {
-	pgoff_t index = nfs_readdir_page_cookie_hash(cookie);
-	struct page *page;
+	pgoff_t index = nfs_readdir_folio_cookie_hash(cookie);
+	struct folio *folio;
 
-	page = grab_cache_page_nowait(mapping, index);
-	if (!page)
+	folio = __filemap_get_folio(mapping, index,
+			FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
+			mapping_gfp_mask(mapping));
+	if (!folio)
 		return NULL;
-	nfs_readdir_page_init_and_validate(page, cookie, change_attr);
-	if (nfs_readdir_page_last_cookie(page) != cookie)
-		nfs_readdir_page_reinit_array(page, cookie, change_attr);
-	return page;
+	nfs_readdir_folio_init_and_validate(folio, cookie, change_attr);
+	if (nfs_readdir_folio_last_cookie(folio) != cookie)
+		nfs_readdir_folio_reinit_array(folio, cookie, change_attr);
+	return folio;
 }
 
 static inline
@@ -481,11 +478,11 @@ bool nfs_readdir_use_cookie(const struct file *filp)
 static void nfs_readdir_seek_next_array(struct nfs_cache_array *array,
 					struct nfs_readdir_descriptor *desc)
 {
-	if (array->page_full) {
+	if (array->folio_full) {
 		desc->last_cookie = array->last_cookie;
 		desc->current_index += array->size;
 		desc->cache_entry_index = 0;
-		desc->page_index++;
+		desc->folio_index++;
 	} else
 		desc->last_cookie = nfs_readdir_array_index_cookie(array);
 }
@@ -494,7 +491,7 @@ static void nfs_readdir_rewind_search(struct nfs_readdir_descriptor *desc)
 {
 	desc->current_index = 0;
 	desc->last_cookie = 0;
-	desc->page_index = 0;
+	desc->folio_index = 0;
 }
 
 static int nfs_readdir_search_for_pos(struct nfs_cache_array *array,
@@ -506,7 +503,7 @@ static int nfs_readdir_search_for_pos(struct nfs_cache_array *array,
 	if (diff < 0)
 		goto out_eof;
 	if (diff >= array->size) {
-		if (array->page_is_eof)
+		if (array->folio_is_eof)
 			goto out_eof;
 		nfs_readdir_seek_next_array(array, desc);
 		return -EAGAIN;
@@ -554,7 +551,7 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array,
 		}
 	}
 check_eof:
-	if (array->page_is_eof) {
+	if (array->folio_is_eof) {
 		status = -EBADCOOKIE;
 		if (desc->dir_cookie == array->last_cookie)
 			desc->eof = true;
@@ -568,7 +565,7 @@ static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc)
 	struct nfs_cache_array *array;
 	int status;
 
-	array = kmap_local_page(desc->page);
+	array = kmap_local_folio(desc->folio, 0);
 
 	if (desc->dir_cookie == 0)
 		status = nfs_readdir_search_for_pos(array, desc);
@@ -819,16 +816,17 @@ static int nfs_readdir_entry_decode(struct nfs_readdir_descriptor *desc,
 }
 
 /* Perform conversion from xdr to cache array */
-static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
-				   struct nfs_entry *entry,
-				   struct page **xdr_pages, unsigned int buflen,
-				   struct page **arrays, size_t narrays,
-				   u64 change_attr)
+static int nfs_readdir_folio_filler(struct nfs_readdir_descriptor *desc,
+				    struct nfs_entry *entry,
+				    struct page **xdr_pages, unsigned int buflen,
+				    struct folio **arrays, size_t narrays,
+				    u64 change_attr)
 {
 	struct address_space *mapping = desc->file->f_mapping;
+	struct folio *new, *folio = *arrays;
 	struct xdr_stream stream;
+	struct page *scratch;
 	struct xdr_buf buf;
-	struct page *scratch, *new, *page = *arrays;
 	u64 cookie;
 	int status;
 
@@ -844,36 +842,36 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
 		if (status != 0)
 			break;
 
-		status = nfs_readdir_page_array_append(page, entry, &cookie);
+		status = nfs_readdir_folio_array_append(folio, entry, &cookie);
 		if (status != -ENOSPC)
 			continue;
 
-		if (page->mapping != mapping) {
+		if (folio->mapping != mapping) {
 			if (!--narrays)
 				break;
-			new = nfs_readdir_page_array_alloc(cookie, GFP_KERNEL);
+			new = nfs_readdir_folio_array_alloc(cookie, GFP_KERNEL);
 			if (!new)
 				break;
 			arrays++;
-			*arrays = page = new;
+			*arrays = folio = new;
 		} else {
-			new = nfs_readdir_page_get_next(mapping, cookie,
-							change_attr);
+			new = nfs_readdir_folio_get_next(mapping, cookie,
+							 change_attr);
 			if (!new)
 				break;
-			if (page != *arrays)
-				nfs_readdir_page_unlock_and_put(page);
-			page = new;
+			if (folio != *arrays)
+				nfs_readdir_folio_unlock_and_put(folio);
+			folio = new;
 		}
-		desc->page_index_max++;
-		status = nfs_readdir_page_array_append(page, entry, &cookie);
+		desc->folio_index_max++;
+		status = nfs_readdir_folio_array_append(folio, entry, &cookie);
 	} while (!status && !entry->eof);
 
 	switch (status) {
 	case -EBADCOOKIE:
 		if (!entry->eof)
 			break;
-		nfs_readdir_page_set_eof(page);
+		nfs_readdir_folio_set_eof(folio);
 		fallthrough;
 	case -EAGAIN:
 		status = 0;
@@ -886,8 +884,8 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
 			;
 	}
 
-	if (page != *arrays)
-		nfs_readdir_page_unlock_and_put(page);
+	if (folio != *arrays)
+		nfs_readdir_folio_unlock_and_put(folio);
 
 	put_page(scratch);
 	return status;
@@ -927,11 +925,11 @@ out_freepages:
 
 static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
 				    __be32 *verf_arg, __be32 *verf_res,
-				    struct page **arrays, size_t narrays)
+				    struct folio **arrays, size_t narrays)
 {
 	u64 change_attr;
 	struct page **pages;
-	struct page *page = *arrays;
+	struct folio *folio = *arrays;
 	struct nfs_entry *entry;
 	size_t array_size;
 	struct inode *inode = file_inode(desc->file);
@@ -942,7 +940,7 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
 	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
 	if (!entry)
 		return -ENOMEM;
-	entry->cookie = nfs_readdir_page_last_cookie(page);
+	entry->cookie = nfs_readdir_folio_last_cookie(folio);
 	entry->fh = nfs_alloc_fhandle();
 	entry->fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode));
 	entry->server = NFS_SERVER(inode);
@@ -962,10 +960,10 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
 
 	pglen = status;
 	if (pglen != 0)
-		status = nfs_readdir_page_filler(desc, entry, pages, pglen,
-						 arrays, narrays, change_attr);
+		status = nfs_readdir_folio_filler(desc, entry, pages, pglen,
+						  arrays, narrays, change_attr);
 	else
-		nfs_readdir_page_set_eof(page);
+		nfs_readdir_folio_set_eof(folio);
 	desc->buffer_fills++;
 
 free_pages:
@@ -977,33 +975,33 @@ out:
 	return status;
 }
 
-static void nfs_readdir_page_put(struct nfs_readdir_descriptor *desc)
+static void nfs_readdir_folio_put(struct nfs_readdir_descriptor *desc)
 {
-	put_page(desc->page);
-	desc->page = NULL;
+	folio_put(desc->folio);
+	desc->folio = NULL;
 }
 
 static void
-nfs_readdir_page_unlock_and_put_cached(struct nfs_readdir_descriptor *desc)
+nfs_readdir_folio_unlock_and_put_cached(struct nfs_readdir_descriptor *desc)
 {
-	unlock_page(desc->page);
-	nfs_readdir_page_put(desc);
+	folio_unlock(desc->folio);
+	nfs_readdir_folio_put(desc);
 }
 
-static struct page *
-nfs_readdir_page_get_cached(struct nfs_readdir_descriptor *desc)
+static struct folio *
+nfs_readdir_folio_get_cached(struct nfs_readdir_descriptor *desc)
 {
 	struct address_space *mapping = desc->file->f_mapping;
 	u64 change_attr = inode_peek_iversion_raw(mapping->host);
 	u64 cookie = desc->last_cookie;
-	struct page *page;
+	struct folio *folio;
 
-	page = nfs_readdir_page_get_locked(mapping, cookie, change_attr);
-	if (!page)
+	folio = nfs_readdir_folio_get_locked(mapping, cookie, change_attr);
+	if (!folio)
 		return NULL;
-	if (desc->clear_cache && !nfs_readdir_page_needs_filling(page))
-		nfs_readdir_page_reinit_array(page, cookie, change_attr);
-	return page;
+	if (desc->clear_cache && !nfs_readdir_folio_needs_filling(folio))
+		nfs_readdir_folio_reinit_array(folio, cookie, change_attr);
+	return folio;
 }
 
 /*
@@ -1017,21 +1015,21 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
 	__be32 verf[NFS_DIR_VERIFIER_SIZE];
 	int res;
 
-	desc->page = nfs_readdir_page_get_cached(desc);
-	if (!desc->page)
+	desc->folio = nfs_readdir_folio_get_cached(desc);
+	if (!desc->folio)
 		return -ENOMEM;
-	if (nfs_readdir_page_needs_filling(desc->page)) {
+	if (nfs_readdir_folio_needs_filling(desc->folio)) {
 		/* Grow the dtsize if we had to go back for more pages */
-		if (desc->page_index == desc->page_index_max)
+		if (desc->folio_index == desc->folio_index_max)
 			nfs_grow_dtsize(desc);
-		desc->page_index_max = desc->page_index;
+		desc->folio_index_max = desc->folio_index;
 		trace_nfs_readdir_cache_fill(desc->file, nfsi->cookieverf,
 					     desc->last_cookie,
-					     desc->page->index, desc->dtsize);
+					     desc->folio->index, desc->dtsize);
 		res = nfs_readdir_xdr_to_array(desc, nfsi->cookieverf, verf,
-					       &desc->page, 1);
+					       &desc->folio, 1);
 		if (res < 0) {
-			nfs_readdir_page_unlock_and_put_cached(desc);
+			nfs_readdir_folio_unlock_and_put_cached(desc);
 			trace_nfs_readdir_cache_fill_done(inode, res);
 			if (res == -EBADCOOKIE || res == -ENOTSYNC) {
 				invalidate_inode_pages2(desc->file->f_mapping);
@@ -1059,7 +1057,7 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
 	res = nfs_readdir_search_array(desc);
 	if (res == 0)
 		return 0;
-	nfs_readdir_page_unlock_and_put_cached(desc);
+	nfs_readdir_folio_unlock_and_put_cached(desc);
 	return res;
 }
 
@@ -1087,7 +1085,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
 	unsigned int i;
 	bool first_emit = !desc->dir_cookie;
 
-	array = kmap_local_page(desc->page);
+	array = kmap_local_folio(desc->folio, 0);
 	for (i = desc->cache_entry_index; i < array->size; i++) {
 		struct nfs_cache_array_entry *ent;
 
@@ -1114,7 +1112,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
 			break;
 		}
 	}
-	if (array->page_is_eof)
+	if (array->folio_is_eof)
 		desc->eof = !desc->eob;
 
 	kunmap_local(array);
@@ -1136,7 +1134,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
  */
 static int uncached_readdir(struct nfs_readdir_descriptor *desc)
 {
-	struct page	**arrays;
+	struct folio	**arrays;
 	size_t		i, sz = 512;
 	__be32		verf[NFS_DIR_VERIFIER_SIZE];
 	int		status = -ENOMEM;
@@ -1147,14 +1145,14 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
 	arrays = kcalloc(sz, sizeof(*arrays), GFP_KERNEL);
 	if (!arrays)
 		goto out;
-	arrays[0] = nfs_readdir_page_array_alloc(desc->dir_cookie, GFP_KERNEL);
+	arrays[0] = nfs_readdir_folio_array_alloc(desc->dir_cookie, GFP_KERNEL);
 	if (!arrays[0])
 		goto out;
 
-	desc->page_index = 0;
+	desc->folio_index = 0;
 	desc->cache_entry_index = 0;
 	desc->last_cookie = desc->dir_cookie;
-	desc->page_index_max = 0;
+	desc->folio_index_max = 0;
 
 	trace_nfs_readdir_uncached(desc->file, desc->verf, desc->last_cookie,
 				   -1, desc->dtsize);
@@ -1166,10 +1164,10 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
 	}
 
 	for (i = 0; !desc->eob && i < sz && arrays[i]; i++) {
-		desc->page = arrays[i];
+		desc->folio = arrays[i];
 		nfs_do_filldir(desc, verf);
 	}
-	desc->page = NULL;
+	desc->folio = NULL;
 
 	/*
 	 * Grow the dtsize if we have to go back for more pages,
@@ -1179,16 +1177,16 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
 		if (!desc->eob)
 			nfs_grow_dtsize(desc);
 		else if (desc->buffer_fills == 1 &&
-			 i < (desc->page_index_max >> 1))
+			 i < (desc->folio_index_max >> 1))
 			nfs_shrink_dtsize(desc);
 	}
 out_free:
 	for (i = 0; i < sz && arrays[i]; i++)
-		nfs_readdir_page_array_free(arrays[i]);
+		nfs_readdir_folio_array_free(arrays[i]);
 out:
 	if (!nfs_readdir_use_cookie(desc->file))
 		nfs_readdir_rewind_search(desc);
-	desc->page_index_max = -1;
+	desc->folio_index_max = -1;
 	kfree(arrays);
 	dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status);
 	return status;
@@ -1240,11 +1238,11 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 		goto out;
 	desc->file = file;
 	desc->ctx = ctx;
-	desc->page_index_max = -1;
+	desc->folio_index_max = -1;
 
 	spin_lock(&file->f_lock);
 	desc->dir_cookie = dir_ctx->dir_cookie;
-	desc->page_index = dir_ctx->page_index;
+	desc->folio_index = dir_ctx->page_index;
 	desc->last_cookie = dir_ctx->last_cookie;
 	desc->attr_gencount = dir_ctx->attr_gencount;
 	desc->eof = dir_ctx->eof;
@@ -1291,8 +1289,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 			break;
 
 		nfs_do_filldir(desc, nfsi->cookieverf);
-		nfs_readdir_page_unlock_and_put_cached(desc);
-		if (desc->page_index == desc->page_index_max)
+		nfs_readdir_folio_unlock_and_put_cached(desc);
+		if (desc->folio_index == desc->folio_index_max)
 			desc->clear_cache = force_clear;
 	} while (!desc->eob && !desc->eof);
 
@@ -1300,7 +1298,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	dir_ctx->dir_cookie = desc->dir_cookie;
 	dir_ctx->last_cookie = desc->last_cookie;
 	dir_ctx->attr_gencount = desc->attr_gencount;
-	dir_ctx->page_index = desc->page_index;
+	dir_ctx->page_index = desc->folio_index;
 	dir_ctx->force_clear = force_clear;
 	dir_ctx->eof = desc->eof;
 	dir_ctx->dtsize = desc->dtsize;
diff --git a/fs/nfs/export.c b/fs/nfs/export.c
index d6a6d1ebb8fd..be686b8e0c54 100644
--- a/fs/nfs/export.c
+++ b/fs/nfs/export.c
@@ -149,7 +149,10 @@ const struct export_operations nfs_export_ops = {
 	.encode_fh = nfs_encode_fh,
 	.fh_to_dentry = nfs_fh_to_dentry,
 	.get_parent = nfs_get_parent,
-	.flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK|
-		EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS|
-		EXPORT_OP_NOATOMIC_ATTR,
+	.flags = EXPORT_OP_NOWCC		|
+		 EXPORT_OP_NOSUBTREECHK		|
+		 EXPORT_OP_CLOSE_BEFORE_UNLINK	|
+		 EXPORT_OP_REMOTE_FS		|
+		 EXPORT_OP_NOATOMIC_ATTR	|
+		 EXPORT_OP_FLUSH_ON_CLOSE,
 };
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 2474cbc30712..f0edf5a36237 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -328,8 +328,8 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 start:
 	folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, FGP_WRITEBEGIN,
 				    mapping_gfp_mask(mapping));
-	if (!folio)
-		return -ENOMEM;
+	if (IS_ERR(folio))
+		return PTR_ERR(folio);
 	*pagep = &folio->page;
 
 	ret = nfs_flush_incompatible(file, folio);
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index ea5f2976dfab..8c35d88a84b1 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -15,6 +15,9 @@
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/iversion.h>
+#include <linux/xarray.h>
+#include <linux/fscache.h>
+#include <linux/netfs.h>
 
 #include "internal.h"
 #include "iostat.h"
@@ -163,13 +166,14 @@ void nfs_fscache_init_inode(struct inode *inode)
 	struct nfs_server *nfss = NFS_SERVER(inode);
 	struct nfs_inode *nfsi = NFS_I(inode);
 
-	nfsi->fscache = NULL;
+	netfs_inode(inode)->cache = NULL;
 	if (!(nfss->fscache && S_ISREG(inode->i_mode)))
 		return;
 
 	nfs_fscache_update_auxdata(&auxdata, inode);
 
-	nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache,
+	netfs_inode(inode)->cache = fscache_acquire_cookie(
+					       nfss->fscache,
 					       0,
 					       nfsi->fh.data, /* index_key */
 					       nfsi->fh.size,
@@ -183,11 +187,8 @@ void nfs_fscache_init_inode(struct inode *inode)
  */
 void nfs_fscache_clear_inode(struct inode *inode)
 {
-	struct nfs_inode *nfsi = NFS_I(inode);
-	struct fscache_cookie *cookie = nfs_i_fscache(inode);
-
-	fscache_relinquish_cookie(cookie, false);
-	nfsi->fscache = NULL;
+	fscache_relinquish_cookie(netfs_i_cookie(netfs_inode(inode)), false);
+	netfs_inode(inode)->cache = NULL;
 }
 
 /*
@@ -212,7 +213,7 @@ void nfs_fscache_clear_inode(struct inode *inode)
 void nfs_fscache_open_file(struct inode *inode, struct file *filp)
 {
 	struct nfs_fscache_inode_auxdata auxdata;
-	struct fscache_cookie *cookie = nfs_i_fscache(inode);
+	struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode));
 	bool open_for_write = inode_is_open_for_write(inode);
 
 	if (!fscache_cookie_valid(cookie))
@@ -230,115 +231,160 @@ EXPORT_SYMBOL_GPL(nfs_fscache_open_file);
 void nfs_fscache_release_file(struct inode *inode, struct file *filp)
 {
 	struct nfs_fscache_inode_auxdata auxdata;
-	struct fscache_cookie *cookie = nfs_i_fscache(inode);
+	struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode));
 	loff_t i_size = i_size_read(inode);
 
 	nfs_fscache_update_auxdata(&auxdata, inode);
 	fscache_unuse_cookie(cookie, &auxdata, &i_size);
 }
 
-/*
- * Fallback page reading interface.
- */
-static int fscache_fallback_read_page(struct inode *inode, struct page *page)
+int nfs_netfs_read_folio(struct file *file, struct folio *folio)
 {
-	struct netfs_cache_resources cres;
-	struct fscache_cookie *cookie = nfs_i_fscache(inode);
-	struct iov_iter iter;
-	struct bio_vec bvec;
-	int ret;
-
-	memset(&cres, 0, sizeof(cres));
-	bvec_set_page(&bvec, page, PAGE_SIZE, 0);
-	iov_iter_bvec(&iter, ITER_DEST, &bvec, 1, PAGE_SIZE);
-
-	ret = fscache_begin_read_operation(&cres, cookie);
-	if (ret < 0)
-		return ret;
-
-	ret = fscache_read(&cres, page_offset(page), &iter, NETFS_READ_HOLE_FAIL,
-			   NULL, NULL);
-	fscache_end_operation(&cres);
-	return ret;
+	if (!netfs_inode(folio_inode(folio))->cache)
+		return -ENOBUFS;
+
+	return netfs_read_folio(file, folio);
 }
 
-/*
- * Fallback page writing interface.
- */
-static int fscache_fallback_write_page(struct inode *inode, struct page *page,
-				       bool no_space_allocated_yet)
+int nfs_netfs_readahead(struct readahead_control *ractl)
 {
-	struct netfs_cache_resources cres;
-	struct fscache_cookie *cookie = nfs_i_fscache(inode);
-	struct iov_iter iter;
-	struct bio_vec bvec;
-	loff_t start = page_offset(page);
-	size_t len = PAGE_SIZE;
-	int ret;
-
-	memset(&cres, 0, sizeof(cres));
-	bvec_set_page(&bvec, page, PAGE_SIZE, 0);
-	iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE);
-
-	ret = fscache_begin_write_operation(&cres, cookie);
-	if (ret < 0)
-		return ret;
-
-	ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode),
-				      no_space_allocated_yet);
-	if (ret == 0)
-		ret = fscache_write(&cres, page_offset(page), &iter, NULL, NULL);
-	fscache_end_operation(&cres);
-	return ret;
+	struct inode *inode = ractl->mapping->host;
+
+	if (!netfs_inode(inode)->cache)
+		return -ENOBUFS;
+
+	netfs_readahead(ractl);
+	return 0;
 }
 
-/*
- * Retrieve a page from fscache
- */
-int __nfs_fscache_read_page(struct inode *inode, struct page *page)
+static atomic_t nfs_netfs_debug_id;
+static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *file)
 {
-	int ret;
+	rreq->netfs_priv = get_nfs_open_context(nfs_file_open_context(file));
+	rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id);
 
-	trace_nfs_fscache_read_page(inode, page);
-	if (PageChecked(page)) {
-		ClearPageChecked(page);
-		ret = 1;
-		goto out;
-	}
+	return 0;
+}
 
-	ret = fscache_fallback_read_page(inode, page);
-	if (ret < 0) {
-		nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL);
-		SetPageChecked(page);
-		goto out;
-	}
+static void nfs_netfs_free_request(struct netfs_io_request *rreq)
+{
+	put_nfs_open_context(rreq->netfs_priv);
+}
 
-	/* Read completed synchronously */
-	nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK);
-	SetPageUptodate(page);
-	ret = 0;
-out:
-	trace_nfs_fscache_read_page_exit(inode, page, ret);
-	return ret;
+static inline int nfs_netfs_begin_cache_operation(struct netfs_io_request *rreq)
+{
+	return fscache_begin_read_operation(&rreq->cache_resources,
+					    netfs_i_cookie(netfs_inode(rreq->inode)));
 }
 
-/*
- * Store a newly fetched page in fscache.  We can be certain there's no page
- * stored in the cache as yet otherwise we would've read it from there.
- */
-void __nfs_fscache_write_page(struct inode *inode, struct page *page)
+static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sreq)
 {
-	int ret;
+	struct nfs_netfs_io_data *netfs;
+
+	netfs = kzalloc(sizeof(*netfs), GFP_KERNEL_ACCOUNT);
+	if (!netfs)
+		return NULL;
+	netfs->sreq = sreq;
+	refcount_set(&netfs->refcount, 1);
+	return netfs;
+}
 
-	trace_nfs_fscache_write_page(inode, page);
+static bool nfs_netfs_clamp_length(struct netfs_io_subrequest *sreq)
+{
+	size_t	rsize = NFS_SB(sreq->rreq->inode->i_sb)->rsize;
 
-	ret = fscache_fallback_write_page(inode, page, true);
+	sreq->len = min(sreq->len, rsize);
+	return true;
+}
 
-	if (ret != 0) {
-		nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL);
-		nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED);
-	} else {
-		nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_OK);
+static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq)
+{
+	struct nfs_netfs_io_data	*netfs;
+	struct nfs_pageio_descriptor	pgio;
+	struct inode *inode = sreq->rreq->inode;
+	struct nfs_open_context *ctx = sreq->rreq->netfs_priv;
+	struct page *page;
+	int err;
+	pgoff_t start = (sreq->start + sreq->transferred) >> PAGE_SHIFT;
+	pgoff_t last = ((sreq->start + sreq->len -
+			 sreq->transferred - 1) >> PAGE_SHIFT);
+	XA_STATE(xas, &sreq->rreq->mapping->i_pages, start);
+
+	nfs_pageio_init_read(&pgio, inode, false,
+			     &nfs_async_read_completion_ops);
+
+	netfs = nfs_netfs_alloc(sreq);
+	if (!netfs)
+		return netfs_subreq_terminated(sreq, -ENOMEM, false);
+
+	pgio.pg_netfs = netfs; /* used in completion */
+
+	xas_lock(&xas);
+	xas_for_each(&xas, page, last) {
+		/* nfs_read_add_folio() may schedule() due to pNFS layout and other RPCs  */
+		xas_pause(&xas);
+		xas_unlock(&xas);
+		err = nfs_read_add_folio(&pgio, ctx, page_folio(page));
+		if (err < 0) {
+			netfs->error = err;
+			goto out;
+		}
+		xas_lock(&xas);
 	}
-	trace_nfs_fscache_write_page_exit(inode, page, ret);
+	xas_unlock(&xas);
+out:
+	nfs_pageio_complete_read(&pgio);
+	nfs_netfs_put(netfs);
+}
+
+void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr)
+{
+	struct nfs_netfs_io_data        *netfs = hdr->netfs;
+
+	if (!netfs)
+		return;
+
+	nfs_netfs_get(netfs);
+}
+
+int nfs_netfs_folio_unlock(struct folio *folio)
+{
+	struct inode *inode = folio_file_mapping(folio)->host;
+
+	/*
+	 * If fscache is enabled, netfs will unlock pages.
+	 */
+	if (netfs_inode(inode)->cache)
+		return 0;
+
+	return 1;
 }
+
+void nfs_netfs_read_completion(struct nfs_pgio_header *hdr)
+{
+	struct nfs_netfs_io_data        *netfs = hdr->netfs;
+	struct netfs_io_subrequest      *sreq;
+
+	if (!netfs)
+		return;
+
+	sreq = netfs->sreq;
+	if (test_bit(NFS_IOHDR_EOF, &hdr->flags))
+		__set_bit(NETFS_SREQ_CLEAR_TAIL, &sreq->flags);
+
+	if (hdr->error)
+		netfs->error = hdr->error;
+	else
+		atomic64_add(hdr->res.count, &netfs->transferred);
+
+	nfs_netfs_put(netfs);
+	hdr->netfs = NULL;
+}
+
+const struct netfs_request_ops nfs_netfs_ops = {
+	.init_request		= nfs_netfs_init_request,
+	.free_request		= nfs_netfs_free_request,
+	.begin_cache_operation	= nfs_netfs_begin_cache_operation,
+	.issue_read		= nfs_netfs_issue_read,
+	.clamp_length		= nfs_netfs_clamp_length
+};
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 2a37af880978..e1706e736c64 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -34,6 +34,58 @@ struct nfs_fscache_inode_auxdata {
 	u64	change_attr;
 };
 
+struct nfs_netfs_io_data {
+	/*
+	 * NFS may split a netfs_io_subrequest into multiple RPCs, each
+	 * with their own read completion.  In netfs, we can only call
+	 * netfs_subreq_terminated() once for each subrequest.  Use the
+	 * refcount here to double as a marker of the last RPC completion,
+	 * and only call netfs via netfs_subreq_terminated() once.
+	 */
+	refcount_t			refcount;
+	struct netfs_io_subrequest	*sreq;
+
+	/*
+	 * Final disposition of the netfs_io_subrequest, sent in
+	 * netfs_subreq_terminated()
+	 */
+	atomic64_t	transferred;
+	int		error;
+};
+
+static inline void nfs_netfs_get(struct nfs_netfs_io_data *netfs)
+{
+	refcount_inc(&netfs->refcount);
+}
+
+static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs)
+{
+	ssize_t final_len;
+
+	/* Only the last RPC completion should call netfs_subreq_terminated() */
+	if (!refcount_dec_and_test(&netfs->refcount))
+		return;
+
+	/*
+	 * The NFS pageio interface may read a complete page, even when netfs
+	 * only asked for a partial page.  Specifically, this may be seen when
+	 * one thread is truncating a file while another one is reading the last
+	 * page of the file.
+	 * Correct the final length here to be no larger than the netfs subrequest
+	 * length, and thus avoid netfs's "Subreq overread" warning message.
+	 */
+	final_len = min_t(s64, netfs->sreq->len, atomic64_read(&netfs->transferred));
+	netfs_subreq_terminated(netfs->sreq, netfs->error ?: final_len, false);
+	kfree(netfs);
+}
+static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi)
+{
+	netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops);
+}
+extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr);
+extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr);
+extern int nfs_netfs_folio_unlock(struct folio *folio);
+
 /*
  * fscache.c
  */
@@ -44,9 +96,8 @@ extern void nfs_fscache_init_inode(struct inode *);
 extern void nfs_fscache_clear_inode(struct inode *);
 extern void nfs_fscache_open_file(struct inode *, struct file *);
 extern void nfs_fscache_release_file(struct inode *, struct file *);
-
-extern int __nfs_fscache_read_page(struct inode *, struct page *);
-extern void __nfs_fscache_write_page(struct inode *, struct page *);
+extern int nfs_netfs_readahead(struct readahead_control *ractl);
+extern int nfs_netfs_read_folio(struct file *file, struct folio *folio);
 
 static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
 {
@@ -54,34 +105,11 @@ static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
 		if (current_is_kswapd() || !(gfp & __GFP_FS))
 			return false;
 		folio_wait_fscache(folio);
-		fscache_note_page_release(nfs_i_fscache(folio->mapping->host));
-		nfs_inc_fscache_stats(folio->mapping->host,
-				      NFSIOS_FSCACHE_PAGES_UNCACHED);
 	}
+	fscache_note_page_release(netfs_i_cookie(netfs_inode(folio->mapping->host)));
 	return true;
 }
 
-/*
- * Retrieve a page from an inode data storage object.
- */
-static inline int nfs_fscache_read_page(struct inode *inode, struct page *page)
-{
-	if (nfs_i_fscache(inode))
-		return __nfs_fscache_read_page(inode, page);
-	return -ENOBUFS;
-}
-
-/*
- * Store a page newly fetched from the server in an inode data storage object
- * in the cache.
- */
-static inline void nfs_fscache_write_page(struct inode *inode,
-					   struct page *page)
-{
-	if (nfs_i_fscache(inode))
-		__nfs_fscache_write_page(inode, page);
-}
-
 static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *auxdata,
 					      struct inode *inode)
 {
@@ -101,13 +129,10 @@ static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *
 static inline void nfs_fscache_invalidate(struct inode *inode, int flags)
 {
 	struct nfs_fscache_inode_auxdata auxdata;
-	struct nfs_inode *nfsi = NFS_I(inode);
+	struct fscache_cookie *cookie =  netfs_i_cookie(&NFS_I(inode)->netfs);
 
-	if (nfsi->fscache) {
-		nfs_fscache_update_auxdata(&auxdata, inode);
-		fscache_invalidate(nfsi->fscache, &auxdata,
-				   i_size_read(inode), flags);
-	}
+	nfs_fscache_update_auxdata(&auxdata, inode);
+	fscache_invalidate(cookie, &auxdata, i_size_read(inode), flags);
 }
 
 /*
@@ -120,7 +145,28 @@ static inline const char *nfs_server_fscache_state(struct nfs_server *server)
 	return "no ";
 }
 
+static inline void nfs_netfs_set_pgio_header(struct nfs_pgio_header *hdr,
+					     struct nfs_pageio_descriptor *desc)
+{
+	hdr->netfs = desc->pg_netfs;
+}
+static inline void nfs_netfs_set_pageio_descriptor(struct nfs_pageio_descriptor *desc,
+						   struct nfs_pgio_header *hdr)
+{
+	desc->pg_netfs = hdr->netfs;
+}
+static inline void nfs_netfs_reset_pageio_descriptor(struct nfs_pageio_descriptor *desc)
+{
+	desc->pg_netfs = NULL;
+}
 #else /* CONFIG_NFS_FSCACHE */
+static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {}
+static inline void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr) {}
+static inline void nfs_netfs_read_completion(struct nfs_pgio_header *hdr) {}
+static inline int nfs_netfs_folio_unlock(struct folio *folio)
+{
+	return 1;
+}
 static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
 
 static inline void nfs_fscache_init_inode(struct inode *inode) {}
@@ -128,22 +174,29 @@ static inline void nfs_fscache_clear_inode(struct inode *inode) {}
 static inline void nfs_fscache_open_file(struct inode *inode,
 					 struct file *filp) {}
 static inline void nfs_fscache_release_file(struct inode *inode, struct file *file) {}
-
-static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
+static inline int nfs_netfs_readahead(struct readahead_control *ractl)
 {
-	return true; /* may release folio */
+	return -ENOBUFS;
 }
-static inline int nfs_fscache_read_page(struct inode *inode, struct page *page)
+static inline int nfs_netfs_read_folio(struct file *file, struct folio *folio)
 {
 	return -ENOBUFS;
 }
-static inline void nfs_fscache_write_page(struct inode *inode, struct page *page) {}
+
+static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
+{
+	return true; /* may release folio */
+}
 static inline void nfs_fscache_invalidate(struct inode *inode, int flags) {}
 
 static inline const char *nfs_server_fscache_state(struct nfs_server *server)
 {
 	return "no ";
 }
-
+static inline void nfs_netfs_set_pgio_header(struct nfs_pgio_header *hdr,
+					     struct nfs_pageio_descriptor *desc) {}
+static inline void nfs_netfs_set_pageio_descriptor(struct nfs_pageio_descriptor *desc,
+						   struct nfs_pgio_header *hdr) {}
+static inline void nfs_netfs_reset_pageio_descriptor(struct nfs_pageio_descriptor *desc) {}
 #endif /* CONFIG_NFS_FSCACHE */
 #endif /* _NFS_FSCACHE_H */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 97a76706fd54..a910b9a638c5 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -208,11 +208,12 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
 
 	nfsi->cache_validity |= flags;
 
-	if (inode->i_mapping->nrpages == 0)
-		nfsi->cache_validity &= ~(NFS_INO_INVALID_DATA |
-					  NFS_INO_DATA_INVAL_DEFER);
-	else if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
-		nfsi->cache_validity &= ~NFS_INO_DATA_INVAL_DEFER;
+	if (inode->i_mapping->nrpages == 0) {
+		nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
+		nfs_ooo_clear(nfsi);
+	} else if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
+		nfs_ooo_clear(nfsi);
+	}
 	trace_nfs_set_cache_invalid(inode, 0);
 }
 EXPORT_SYMBOL_GPL(nfs_set_cache_invalid);
@@ -677,9 +678,10 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset)
 	trace_nfs_size_truncate(inode, offset);
 	i_size_write(inode, offset);
 	/* Optimisation */
-	if (offset == 0)
-		NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_DATA |
-				NFS_INO_DATA_INVAL_DEFER);
+	if (offset == 0) {
+		NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA;
+		nfs_ooo_clear(NFS_I(inode));
+	}
 	NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE;
 
 	spin_unlock(&inode->i_lock);
@@ -1107,7 +1109,7 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
 
 	spin_lock(&inode->i_lock);
 	if (list_empty(&nfsi->open_files) &&
-	    (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
+	    nfs_ooo_test(nfsi))
 		nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA |
 						     NFS_INO_REVAL_FORCED);
 	list_add_tail_rcu(&ctx->list, &nfsi->open_files);
@@ -1351,8 +1353,8 @@ int nfs_clear_invalid_mapping(struct address_space *mapping)
 
 	set_bit(NFS_INO_INVALIDATING, bitlock);
 	smp_wmb();
-	nfsi->cache_validity &=
-		~(NFS_INO_INVALID_DATA | NFS_INO_DATA_INVAL_DEFER);
+	nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
+	nfs_ooo_clear(nfsi);
 	spin_unlock(&inode->i_lock);
 	trace_nfs_invalidate_mapping_enter(inode);
 	ret = nfs_invalidate_mapping(inode, mapping);
@@ -1814,6 +1816,66 @@ static int nfs_inode_finish_partial_attr_update(const struct nfs_fattr *fattr,
 	return 0;
 }
 
+static void nfs_ooo_merge(struct nfs_inode *nfsi,
+			  u64 start, u64 end)
+{
+	int i, cnt;
+
+	if (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)
+		/* No point merging anything */
+		return;
+
+	if (!nfsi->ooo) {
+		nfsi->ooo = kmalloc(sizeof(*nfsi->ooo), GFP_ATOMIC);
+		if (!nfsi->ooo) {
+			nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER;
+			return;
+		}
+		nfsi->ooo->cnt = 0;
+	}
+
+	/* add this range, merging if possible */
+	cnt = nfsi->ooo->cnt;
+	for (i = 0; i < cnt; i++) {
+		if (end == nfsi->ooo->gap[i].start)
+			end = nfsi->ooo->gap[i].end;
+		else if (start == nfsi->ooo->gap[i].end)
+			start = nfsi->ooo->gap[i].start;
+		else
+			continue;
+		/* Remove 'i' from table and loop to insert the new range */
+		cnt -= 1;
+		nfsi->ooo->gap[i] = nfsi->ooo->gap[cnt];
+		i = -1;
+	}
+	if (start != end) {
+		if (cnt >= ARRAY_SIZE(nfsi->ooo->gap)) {
+			nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER;
+			kfree(nfsi->ooo);
+			nfsi->ooo = NULL;
+			return;
+		}
+		nfsi->ooo->gap[cnt].start = start;
+		nfsi->ooo->gap[cnt].end = end;
+		cnt += 1;
+	}
+	nfsi->ooo->cnt = cnt;
+}
+
+static void nfs_ooo_record(struct nfs_inode *nfsi,
+			   struct nfs_fattr *fattr)
+{
+	/* This reply was out-of-order, so record in the
+	 * pre/post change id, possibly cancelling
+	 * gaps created when iversion was jumpped forward.
+	 */
+	if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) &&
+	    (fattr->valid & NFS_ATTR_FATTR_PRECHANGE))
+		nfs_ooo_merge(nfsi,
+			      fattr->change_attr,
+			      fattr->pre_change_attr);
+}
+
 static int nfs_refresh_inode_locked(struct inode *inode,
 				    struct nfs_fattr *fattr)
 {
@@ -1824,8 +1886,12 @@ static int nfs_refresh_inode_locked(struct inode *inode,
 
 	if (attr_cmp > 0 || nfs_inode_finish_partial_attr_update(fattr, inode))
 		ret = nfs_update_inode(inode, fattr);
-	else if (attr_cmp == 0)
-		ret = nfs_check_inode_attributes(inode, fattr);
+	else {
+		nfs_ooo_record(NFS_I(inode), fattr);
+
+		if (attr_cmp == 0)
+			ret = nfs_check_inode_attributes(inode, fattr);
+	}
 
 	trace_nfs_refresh_inode_exit(inode, ret);
 	return ret;
@@ -1916,6 +1982,8 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa
 	if (attr_cmp < 0)
 		return 0;
 	if ((fattr->valid & NFS_ATTR_FATTR) == 0 || !attr_cmp) {
+		/* Record the pre/post change info before clearing PRECHANGE */
+		nfs_ooo_record(NFS_I(inode), fattr);
 		fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE
 				| NFS_ATTR_FATTR_PRESIZE
 				| NFS_ATTR_FATTR_PREMTIME
@@ -2070,6 +2138,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 
 	/* More cache consistency checks */
 	if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
+		if (!have_writers && nfsi->ooo && nfsi->ooo->cnt == 1 &&
+		    nfsi->ooo->gap[0].end == inode_peek_iversion_raw(inode)) {
+			/* There is one remaining gap that hasn't been
+			 * merged into iversion - do that now.
+			 */
+			inode_set_iversion_raw(inode, nfsi->ooo->gap[0].start);
+			kfree(nfsi->ooo);
+			nfsi->ooo = NULL;
+		}
 		if (!inode_eq_iversion_raw(inode, fattr->change_attr)) {
 			/* Could it be a race with writeback? */
 			if (!(have_writers || have_delegation)) {
@@ -2091,8 +2168,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 				dprintk("NFS: change_attr change on server for file %s/%ld\n",
 						inode->i_sb->s_id,
 						inode->i_ino);
-			} else if (!have_delegation)
-				nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER;
+			} else if (!have_delegation) {
+				nfs_ooo_record(nfsi, fattr);
+				nfs_ooo_merge(nfsi, inode_peek_iversion_raw(inode),
+					      fattr->change_attr);
+			}
 			inode_set_iversion_raw(inode, fattr->change_attr);
 		}
 	} else {
@@ -2246,18 +2326,22 @@ struct inode *nfs_alloc_inode(struct super_block *sb)
 		return NULL;
 	nfsi->flags = 0UL;
 	nfsi->cache_validity = 0UL;
+	nfsi->ooo = NULL;
 #if IS_ENABLED(CONFIG_NFS_V4)
 	nfsi->nfs4_acl = NULL;
 #endif /* CONFIG_NFS_V4 */
 #ifdef CONFIG_NFS_V4_2
 	nfsi->xattr_cache = NULL;
 #endif
+	nfs_netfs_inode_init(nfsi);
+
 	return &nfsi->vfs_inode;
 }
 EXPORT_SYMBOL_GPL(nfs_alloc_inode);
 
 void nfs_free_inode(struct inode *inode)
 {
+	kfree(NFS_I(inode)->ooo);
 	kmem_cache_free(nfs_inode_cachep, NFS_I(inode));
 }
 EXPORT_SYMBOL_GPL(nfs_free_inode);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 2a65fe2a63ab..3cc027d3bd58 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -452,6 +452,10 @@ extern void nfs_sb_deactive(struct super_block *sb);
 extern int nfs_client_for_each_server(struct nfs_client *clp,
 				      int (*fn)(struct nfs_server *, void *),
 				      void *data);
+#ifdef CONFIG_NFS_FSCACHE
+extern const struct netfs_request_ops nfs_netfs_ops;
+#endif
+
 /* io.c */
 extern void nfs_start_io_read(struct inode *inode);
 extern void nfs_end_io_read(struct inode *inode);
@@ -481,9 +485,14 @@ extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool
 
 struct nfs_pgio_completion_ops;
 /* read.c */
+extern const struct nfs_pgio_completion_ops nfs_async_read_completion_ops;
 extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
 			struct inode *inode, bool force_mds,
 			const struct nfs_pgio_completion_ops *compl_ops);
+extern int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio,
+			       struct nfs_open_context *ctx,
+			       struct folio *folio);
+extern void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio);
 extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
 extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
 
@@ -846,27 +855,12 @@ u64 nfs_timespec_to_change_attr(const struct timespec64 *ts)
 }
 
 #ifdef CONFIG_CRC32
-/**
- * nfs_fhandle_hash - calculate the crc32 hash for the filehandle
- * @fh - pointer to filehandle
- *
- * returns a crc32 hash for the filehandle that is compatible with
- * the one displayed by "wireshark".
- */
-static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
-{
-	return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size);
-}
 static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid)
 {
 	return ~crc32_le(0xFFFFFFFF, &stateid->other[0],
 				NFS4_STATEID_OTHER_SIZE);
 }
 #else
-static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
-{
-	return 0;
-}
 static inline u32 nfs_stateid_hash(nfs4_stateid *stateid)
 {
 	return 0;
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 2ddaab1ac653..5aa776b5a3e7 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -17,9 +17,6 @@
 
 struct nfs_iostats {
 	unsigned long long	bytes[__NFSIOS_BYTESMAX];
-#ifdef CONFIG_NFS_FSCACHE
-	unsigned long long	fscache[__NFSIOS_FSCACHEMAX];
-#endif
 	unsigned long		events[__NFSIOS_COUNTSMAX];
 } ____cacheline_aligned;
 
@@ -49,20 +46,6 @@ static inline void nfs_add_stats(const struct inode *inode,
 	nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
 }
 
-#ifdef CONFIG_NFS_FSCACHE
-static inline void nfs_add_fscache_stats(struct inode *inode,
-					 enum nfs_stat_fscachecounters stat,
-					 long addend)
-{
-	this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend);
-}
-static inline void nfs_inc_fscache_stats(struct inode *inode,
-					 enum nfs_stat_fscachecounters stat)
-{
-	this_cpu_inc(NFS_SERVER(inode)->io_stats->fscache[stat]);
-}
-#endif
-
 static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void)
 {
 	return alloc_percpu(struct nfs_iostats);
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 349cc4f60a28..18d8f6529f61 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -21,9 +21,8 @@ static void nfs3_prepare_get_acl(struct posix_acl **p)
 {
 	struct posix_acl *sentinel = uncached_acl_sentinel(current);
 
-	if (cmpxchg(p, ACL_NOT_CACHED, sentinel) != ACL_NOT_CACHED) {
-		/* Not the first reader or sentinel already in place. */
-	}
+	/* If the ACL isn't being read yet, set our sentinel. */
+	cmpxchg(p, ACL_NOT_CACHED, sentinel);
 }
 
 static void nfs3_complete_get_acl(struct posix_acl **p, struct posix_acl *acl)
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index d80ee88ca996..a6df815a140c 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -1122,7 +1122,6 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
 	uint32_t segments;
 	struct read_plus_segment *segs;
 	int status, i;
-	char scratch_buf[16];
 	__be32 *p;
 
 	status = decode_op_hdr(xdr, OP_READ_PLUS);
@@ -1143,7 +1142,6 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
 	if (!segs)
 		return -ENOMEM;
 
-	xdr_set_scratch_buffer(xdr, &scratch_buf, sizeof(scratch_buf));
 	status = -EIO;
 	for (i = 0; i < segments; i++) {
 		status = decode_read_plus_segment(xdr, &segs[i]);
@@ -1348,6 +1346,8 @@ static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp,
 	struct compound_hdr hdr;
 	int status;
 
+	xdr_set_scratch_buffer(xdr, res->scratch, sizeof(res->scratch));
+
 	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 5607b1e2b821..18f25ff4bff7 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5439,6 +5439,8 @@ static bool nfs4_read_plus_not_supported(struct rpc_task *task,
 
 static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
+	if (hdr->res.scratch)
+		kfree(hdr->res.scratch);
 	if (!nfs4_sequence_done(task, &hdr->res.seq_res))
 		return -EAGAIN;
 	if (nfs4_read_stateid_changed(task, &hdr->args))
@@ -5452,17 +5454,22 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 }
 
 #if defined CONFIG_NFS_V4_2 && defined CONFIG_NFS_V4_2_READ_PLUS
-static void nfs42_read_plus_support(struct nfs_pgio_header *hdr,
+static bool nfs42_read_plus_support(struct nfs_pgio_header *hdr,
 				    struct rpc_message *msg)
 {
 	/* Note: We don't use READ_PLUS with pNFS yet */
-	if (nfs_server_capable(hdr->inode, NFS_CAP_READ_PLUS) && !hdr->ds_clp)
+	if (nfs_server_capable(hdr->inode, NFS_CAP_READ_PLUS) && !hdr->ds_clp) {
 		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS];
+		hdr->res.scratch = kmalloc(32, GFP_KERNEL);
+		return hdr->res.scratch != NULL;
+	}
+	return false;
 }
 #else
-static void nfs42_read_plus_support(struct nfs_pgio_header *hdr,
+static bool nfs42_read_plus_support(struct nfs_pgio_header *hdr,
 				    struct rpc_message *msg)
 {
+	return false;
 }
 #endif /* CONFIG_NFS_V4_2 */
 
@@ -5472,8 +5479,8 @@ static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
 	hdr->timestamp   = jiffies;
 	if (!hdr->pgio_done_cb)
 		hdr->pgio_done_cb = nfs4_read_done_cb;
-	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
-	nfs42_read_plus_support(hdr, msg);
+	if (!nfs42_read_plus_support(hdr, msg))
+		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
 	nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0);
 }
 
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2a0ca5c7f082..bbe49315d99e 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -67,6 +67,8 @@
 
 #define OPENOWNER_POOL_SIZE	8
 
+static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp);
+
 const nfs4_stateid zero_stateid = {
 	{ .data = { 0 } },
 	.type = NFS4_SPECIAL_STATEID_TYPE,
@@ -330,6 +332,8 @@ do_confirm:
 	status = nfs4_proc_create_session(clp, cred);
 	if (status != 0)
 		goto out;
+	if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_CONFIRMED_R))
+		nfs4_state_start_reclaim_reboot(clp);
 	nfs41_finish_session_reset(clp);
 	nfs_mark_client_ready(clp, NFS_CS_READY);
 out:
@@ -1205,10 +1209,6 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
 {
 	struct task_struct *task;
 	char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1];
-	struct rpc_clnt *cl = clp->cl_rpcclient;
-
-	while (cl != cl->cl_parent)
-		cl = cl->cl_parent;
 
 	set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
 	if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) != 0) {
diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c
index c394e4447100..e776200e9a11 100644
--- a/fs/nfs/nfs4sysctl.c
+++ b/fs/nfs/nfs4sysctl.c
@@ -37,27 +37,10 @@ static struct ctl_table nfs4_cb_sysctls[] = {
 	{ }
 };
 
-static struct ctl_table nfs4_cb_sysctl_dir[] = {
-	{
-		.procname = "nfs",
-		.mode = 0555,
-		.child = nfs4_cb_sysctls,
-	},
-	{ }
-};
-
-static struct ctl_table nfs4_cb_sysctl_root[] = {
-	{
-		.procname = "fs",
-		.mode = 0555,
-		.child = nfs4_cb_sysctl_dir,
-	},
-	{ }
-};
-
 int nfs4_register_sysctl(void)
 {
-	nfs4_callback_sysctl_table = register_sysctl_table(nfs4_cb_sysctl_root);
+	nfs4_callback_sysctl_table = register_sysctl("fs/nfs",
+						     nfs4_cb_sysctls);
 	if (nfs4_callback_sysctl_table == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index a778713343df..4e90ca531176 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -39,7 +39,6 @@
 			{ BIT(NFS_INO_STALE), "STALE" }, \
 			{ BIT(NFS_INO_ACL_LRU_SET), "ACL_LRU_SET" }, \
 			{ BIT(NFS_INO_INVALIDATING), "INVALIDATING" }, \
-			{ BIT(NFS_INO_FSCACHE), "FSCACHE" }, \
 			{ BIT(NFS_INO_LAYOUTCOMMIT), "NEED_LAYOUTCOMMIT" }, \
 			{ BIT(NFS_INO_LAYOUTCOMMITTING), "LAYOUTCOMMIT" }, \
 			{ BIT(NFS_INO_LAYOUTSTATS), "LAYOUTSTATS" }, \
@@ -1243,96 +1242,6 @@ TRACE_EVENT(nfs_readpage_short,
 		)
 );
 
-DECLARE_EVENT_CLASS(nfs_fscache_page_event,
-		TP_PROTO(
-			const struct inode *inode,
-			struct page *page
-		),
-
-		TP_ARGS(inode, page),
-
-		TP_STRUCT__entry(
-			__field(dev_t, dev)
-			__field(u32, fhandle)
-			__field(u64, fileid)
-			__field(loff_t, offset)
-		),
-
-		TP_fast_assign(
-			const struct nfs_inode *nfsi = NFS_I(inode);
-			const struct nfs_fh *fh = &nfsi->fh;
-
-			__entry->offset = page_index(page) << PAGE_SHIFT;
-			__entry->dev = inode->i_sb->s_dev;
-			__entry->fileid = nfsi->fileid;
-			__entry->fhandle = nfs_fhandle_hash(fh);
-		),
-
-		TP_printk(
-			"fileid=%02x:%02x:%llu fhandle=0x%08x "
-			"offset=%lld",
-			MAJOR(__entry->dev), MINOR(__entry->dev),
-			(unsigned long long)__entry->fileid,
-			__entry->fhandle,
-			(long long)__entry->offset
-		)
-);
-DECLARE_EVENT_CLASS(nfs_fscache_page_event_done,
-		TP_PROTO(
-			const struct inode *inode,
-			struct page *page,
-			int error
-		),
-
-		TP_ARGS(inode, page, error),
-
-		TP_STRUCT__entry(
-			__field(int, error)
-			__field(dev_t, dev)
-			__field(u32, fhandle)
-			__field(u64, fileid)
-			__field(loff_t, offset)
-		),
-
-		TP_fast_assign(
-			const struct nfs_inode *nfsi = NFS_I(inode);
-			const struct nfs_fh *fh = &nfsi->fh;
-
-			__entry->offset = page_index(page) << PAGE_SHIFT;
-			__entry->dev = inode->i_sb->s_dev;
-			__entry->fileid = nfsi->fileid;
-			__entry->fhandle = nfs_fhandle_hash(fh);
-			__entry->error = error;
-		),
-
-		TP_printk(
-			"fileid=%02x:%02x:%llu fhandle=0x%08x "
-			"offset=%lld error=%d",
-			MAJOR(__entry->dev), MINOR(__entry->dev),
-			(unsigned long long)__entry->fileid,
-			__entry->fhandle,
-			(long long)__entry->offset, __entry->error
-		)
-);
-#define DEFINE_NFS_FSCACHE_PAGE_EVENT(name) \
-	DEFINE_EVENT(nfs_fscache_page_event, name, \
-			TP_PROTO( \
-				const struct inode *inode, \
-				struct page *page \
-			), \
-			TP_ARGS(inode, page))
-#define DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(name) \
-	DEFINE_EVENT(nfs_fscache_page_event_done, name, \
-			TP_PROTO( \
-				const struct inode *inode, \
-				struct page *page, \
-				int error \
-			), \
-			TP_ARGS(inode, page, error))
-DEFINE_NFS_FSCACHE_PAGE_EVENT(nfs_fscache_read_page);
-DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(nfs_fscache_read_page_exit);
-DEFINE_NFS_FSCACHE_PAGE_EVENT(nfs_fscache_write_page);
-DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(nfs_fscache_write_page_exit);
 
 TRACE_EVENT(nfs_pgio_error,
 	TP_PROTO(
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 64fa8de199de..6efb5068c116 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -26,6 +26,7 @@
 #include "internal.h"
 #include "pnfs.h"
 #include "nfstrace.h"
+#include "fscache.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 
@@ -105,6 +106,7 @@ void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
 	hdr->good_bytes = mirror->pg_count;
 	hdr->io_completion = desc->pg_io_completion;
 	hdr->dreq = desc->pg_dreq;
+	nfs_netfs_set_pgio_header(hdr, desc);
 	hdr->release = release;
 	hdr->completion_ops = desc->pg_completion_ops;
 	if (hdr->completion_ops->init_hdr)
@@ -941,6 +943,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 	desc->pg_lseg = NULL;
 	desc->pg_io_completion = NULL;
 	desc->pg_dreq = NULL;
+	nfs_netfs_reset_pageio_descriptor(desc);
 	desc->pg_bsize = bsize;
 
 	desc->pg_mirror_count = 1;
@@ -1477,6 +1480,7 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
 
 	desc->pg_io_completion = hdr->io_completion;
 	desc->pg_dreq = hdr->dreq;
+	nfs_netfs_set_pageio_descriptor(desc, hdr);
 	list_splice_init(&hdr->pages, &pages);
 	while (!list_empty(&pages)) {
 		struct nfs_page *req = nfs_list_entry(pages.next);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index e90988591df4..f71eeee67e20 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -31,7 +31,7 @@
 
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 
-static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops;
+const struct nfs_pgio_completion_ops nfs_async_read_completion_ops;
 static const struct nfs_rw_ops nfs_rw_read_ops;
 
 static struct kmem_cache *nfs_rdata_cachep;
@@ -74,7 +74,7 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
 }
 EXPORT_SYMBOL_GPL(nfs_pageio_init_read);
 
-static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio)
+void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio)
 {
 	struct nfs_pgio_mirror *pgm;
 	unsigned long npages;
@@ -110,28 +110,17 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
 
 static void nfs_readpage_release(struct nfs_page *req, int error)
 {
-	struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
 	struct folio *folio = nfs_page_to_folio(req);
 
-	dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id,
-		(unsigned long long)NFS_FILEID(inode), req->wb_bytes,
-		(long long)req_offset(req));
-
 	if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
 		folio_set_error(folio);
-	if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
-		if (folio_test_uptodate(folio))
-			nfs_fscache_write_page(inode, &folio->page);
-		folio_unlock(folio);
-	}
+	if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
+		if (nfs_netfs_folio_unlock(folio))
+			folio_unlock(folio);
+
 	nfs_release_request(req);
 }
 
-struct nfs_readdesc {
-	struct nfs_pageio_descriptor pgio;
-	struct nfs_open_context *ctx;
-};
-
 static void nfs_page_group_set_uptodate(struct nfs_page *req)
 {
 	if (nfs_page_group_sync_on_bit(req, PG_UPTODATE))
@@ -153,7 +142,8 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr)
 
 		if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) {
 			/* note: regions of the page not covered by a
-			 * request are zeroed in readpage_async_filler */
+			 * request are zeroed in nfs_read_add_folio
+			 */
 			if (bytes > hdr->good_bytes) {
 				/* nothing in this request was good, so zero
 				 * the full extent of the request */
@@ -181,6 +171,8 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr)
 		nfs_list_remove_request(req);
 		nfs_readpage_release(req, error);
 	}
+	nfs_netfs_read_completion(hdr);
+
 out:
 	hdr->release(hdr);
 }
@@ -191,6 +183,7 @@ static void nfs_initiate_read(struct nfs_pgio_header *hdr,
 			      struct rpc_task_setup *task_setup_data, int how)
 {
 	rpc_ops->read_setup(hdr, msg);
+	nfs_netfs_initiate_read(hdr);
 	trace_nfs_initiate_read(hdr);
 }
 
@@ -206,7 +199,7 @@ nfs_async_read_error(struct list_head *head, int error)
 	}
 }
 
-static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = {
+const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = {
 	.error_cleanup = nfs_async_read_error,
 	.completion = nfs_read_completion,
 };
@@ -281,7 +274,9 @@ static void nfs_readpage_result(struct rpc_task *task,
 		nfs_readpage_retry(task, hdr);
 }
 
-static int readpage_async_filler(struct nfs_readdesc *desc, struct folio *folio)
+int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio,
+		       struct nfs_open_context *ctx,
+		       struct folio *folio)
 {
 	struct inode *inode = folio_file_mapping(folio)->host;
 	struct nfs_server *server = NFS_SERVER(inode);
@@ -297,29 +292,21 @@ static int readpage_async_filler(struct nfs_readdesc *desc, struct folio *folio)
 
 	aligned_len = min_t(unsigned int, ALIGN(len, rsize), fsize);
 
-	if (!IS_SYNC(inode)) {
-		error = nfs_fscache_read_page(inode, &folio->page);
-		if (error == 0)
-			goto out_unlock;
+	new = nfs_page_create_from_folio(ctx, folio, 0, aligned_len);
+	if (IS_ERR(new)) {
+		error = PTR_ERR(new);
+		goto out;
 	}
 
-	new = nfs_page_create_from_folio(desc->ctx, folio, 0, aligned_len);
-	if (IS_ERR(new))
-		goto out_error;
-
 	if (len < fsize)
 		folio_zero_segment(folio, len, fsize);
-	if (!nfs_pageio_add_request(&desc->pgio, new)) {
+	if (!nfs_pageio_add_request(pgio, new)) {
 		nfs_list_remove_request(new);
-		error = desc->pgio.pg_error;
+		error = pgio->pg_error;
 		nfs_readpage_release(new, error);
 		goto out;
 	}
 	return 0;
-out_error:
-	error = PTR_ERR(new);
-out_unlock:
-	folio_unlock(folio);
 out:
 	return error;
 }
@@ -332,8 +319,9 @@ out:
  */
 int nfs_read_folio(struct file *file, struct folio *folio)
 {
-	struct nfs_readdesc desc;
 	struct inode *inode = file_inode(file);
+	struct nfs_pageio_descriptor pgio;
+	struct nfs_open_context *ctx;
 	int ret;
 
 	trace_nfs_aop_readpage(inode, folio);
@@ -357,38 +345,43 @@ int nfs_read_folio(struct file *file, struct folio *folio)
 	if (NFS_STALE(inode))
 		goto out_unlock;
 
-	desc.ctx = get_nfs_open_context(nfs_file_open_context(file));
+	ret = nfs_netfs_read_folio(file, folio);
+	if (!ret)
+		goto out;
+
+	ctx = get_nfs_open_context(nfs_file_open_context(file));
 
-	xchg(&desc.ctx->error, 0);
-	nfs_pageio_init_read(&desc.pgio, inode, false,
+	xchg(&ctx->error, 0);
+	nfs_pageio_init_read(&pgio, inode, false,
 			     &nfs_async_read_completion_ops);
 
-	ret = readpage_async_filler(&desc, folio);
+	ret = nfs_read_add_folio(&pgio, ctx, folio);
 	if (ret)
-		goto out;
+		goto out_put;
 
-	nfs_pageio_complete_read(&desc.pgio);
-	ret = desc.pgio.pg_error < 0 ? desc.pgio.pg_error : 0;
+	nfs_pageio_complete_read(&pgio);
+	ret = pgio.pg_error < 0 ? pgio.pg_error : 0;
 	if (!ret) {
 		ret = folio_wait_locked_killable(folio);
 		if (!folio_test_uptodate(folio) && !ret)
-			ret = xchg(&desc.ctx->error, 0);
+			ret = xchg(&ctx->error, 0);
 	}
+out_put:
+	put_nfs_open_context(ctx);
 out:
-	put_nfs_open_context(desc.ctx);
 	trace_nfs_aop_readpage_done(inode, folio, ret);
 	return ret;
 out_unlock:
 	folio_unlock(folio);
-	trace_nfs_aop_readpage_done(inode, folio, ret);
-	return ret;
+	goto out;
 }
 
 void nfs_readahead(struct readahead_control *ractl)
 {
+	struct nfs_pageio_descriptor pgio;
+	struct nfs_open_context *ctx;
 	unsigned int nr_pages = readahead_count(ractl);
 	struct file *file = ractl->file;
-	struct nfs_readdesc desc;
 	struct inode *inode = ractl->mapping->host;
 	struct folio *folio;
 	int ret;
@@ -401,26 +394,30 @@ void nfs_readahead(struct readahead_control *ractl)
 	if (NFS_STALE(inode))
 		goto out;
 
+	ret = nfs_netfs_readahead(ractl);
+	if (!ret)
+		goto out;
+
 	if (file == NULL) {
 		ret = -EBADF;
-		desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
-		if (desc.ctx == NULL)
+		ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
+		if (ctx == NULL)
 			goto out;
 	} else
-		desc.ctx = get_nfs_open_context(nfs_file_open_context(file));
+		ctx = get_nfs_open_context(nfs_file_open_context(file));
 
-	nfs_pageio_init_read(&desc.pgio, inode, false,
+	nfs_pageio_init_read(&pgio, inode, false,
 			     &nfs_async_read_completion_ops);
 
 	while ((folio = readahead_folio(ractl)) != NULL) {
-		ret = readpage_async_filler(&desc, folio);
+		ret = nfs_read_add_folio(&pgio, ctx, folio);
 		if (ret)
 			break;
 	}
 
-	nfs_pageio_complete_read(&desc.pgio);
+	nfs_pageio_complete_read(&pgio);
 
-	put_nfs_open_context(desc.ctx);
+	put_nfs_open_context(ctx);
 out:
 	trace_nfs_aop_readahead_done(inode, nr_pages, ret);
 }
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 397c096d874e..30e53e93049e 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -692,10 +692,6 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root)
 			totals.events[i] += stats->events[i];
 		for (i = 0; i < __NFSIOS_BYTESMAX; i++)
 			totals.bytes[i] += stats->bytes[i];
-#ifdef CONFIG_NFS_FSCACHE
-		for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
-			totals.fscache[i] += stats->fscache[i];
-#endif
 
 		preempt_enable();
 	}
@@ -706,13 +702,6 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root)
 	seq_puts(m, "\n\tbytes:\t");
 	for (i = 0; i < __NFSIOS_BYTESMAX; i++)
 		seq_printf(m, "%Lu ", totals.bytes[i]);
-#ifdef CONFIG_NFS_FSCACHE
-	if (nfss->options & NFS_OPTION_FSCACHE) {
-		seq_puts(m, "\n\tfsc:\t");
-		for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
-			seq_printf(m, "%Lu ", totals.fscache[i]);
-	}
-#endif
 	seq_putc(m, '\n');
 
 	rpc_clnt_show_stats(m, nfss->client);
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index 7aea195ddb35..f39e2089bc4c 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -32,27 +32,9 @@ static struct ctl_table nfs_cb_sysctls[] = {
 	{ }
 };
 
-static struct ctl_table nfs_cb_sysctl_dir[] = {
-	{
-		.procname = "nfs",
-		.mode = 0555,
-		.child = nfs_cb_sysctls,
-	},
-	{ }
-};
-
-static struct ctl_table nfs_cb_sysctl_root[] = {
-	{
-		.procname = "fs",
-		.mode = 0555,
-		.child = nfs_cb_sysctl_dir,
-	},
-	{ }
-};
-
 int nfs_register_sysctl(void)
 {
-	nfs_callback_sysctl_table = register_sysctl_table(nfs_cb_sysctl_root);
+	nfs_callback_sysctl_table = register_sysctl("fs/nfs", nfs_cb_sysctls);
 	if (nfs_callback_sysctl_table == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/nfs_common/nfs_ssc.c b/fs/nfs_common/nfs_ssc.c
index 7c1509e968c8..832246b22c51 100644
--- a/fs/nfs_common/nfs_ssc.c
+++ b/fs/nfs_common/nfs_ssc.c
@@ -12,7 +12,6 @@
 #include <linux/nfs_ssc.h>
 #include "../nfs/nfs4_fs.h"
 
-MODULE_LICENSE("GPL");
 
 struct nfs_ssc_client_ops_tbl nfs_ssc_client_tbl;
 EXPORT_SYMBOL_GPL(nfs_ssc_client_tbl);
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 668c7527b17e..ae85257b4238 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -123,11 +123,11 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
 
 	/* OK, we seem to have a valid key */
 	key.h.flags = 0;
-	key.h.expiry_time = get_expiry(&mesg);
-	if (key.h.expiry_time == 0)
+	err = get_expiry(&mesg, &key.h.expiry_time);
+	if (err)
 		goto out;
 
-	key.ek_client = dom;	
+	key.ek_client = dom;
 	key.ek_fsidtype = fsidtype;
 	memcpy(key.ek_fsid, buf, len);
 
@@ -439,7 +439,6 @@ static int check_export(struct path *path, int *flags, unsigned char *uuid)
 		return -EINVAL;
 	}
 	return 0;
-
 }
 
 #ifdef CONFIG_NFSD_V4
@@ -546,6 +545,29 @@ static inline int
 secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; }
 #endif
 
+static int xprtsec_parse(char **mesg, char *buf, struct svc_export *exp)
+{
+	unsigned int i, mode, listsize;
+	int err;
+
+	err = get_uint(mesg, &listsize);
+	if (err)
+		return err;
+	if (listsize > NFSEXP_XPRTSEC_NUM)
+		return -EINVAL;
+
+	exp->ex_xprtsec_modes = 0;
+	for (i = 0; i < listsize; i++) {
+		err = get_uint(mesg, &mode);
+		if (err)
+			return err;
+		if (mode > NFSEXP_XPRTSEC_MTLS)
+			return -EINVAL;
+		exp->ex_xprtsec_modes |= mode;
+	}
+	return 0;
+}
+
 static inline int
 nfsd_uuid_parse(char **mesg, char *buf, unsigned char **puuid)
 {
@@ -608,11 +630,11 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 	exp.ex_client = dom;
 	exp.cd = cd;
 	exp.ex_devid_map = NULL;
+	exp.ex_xprtsec_modes = NFSEXP_XPRTSEC_ALL;
 
 	/* expiry */
-	err = -EINVAL;
-	exp.h.expiry_time = get_expiry(&mesg);
-	if (exp.h.expiry_time == 0)
+	err = get_expiry(&mesg, &exp.h.expiry_time);
+	if (err)
 		goto out3;
 
 	/* flags */
@@ -624,7 +646,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 		if (err || an_int < 0)
 			goto out3;
 		exp.ex_flags= an_int;
-	
+
 		/* anon uid */
 		err = get_int(&mesg, &an_int);
 		if (err)
@@ -650,6 +672,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 				err = nfsd_uuid_parse(&mesg, buf, &exp.ex_uuid);
 			else if (strcmp(buf, "secinfo") == 0)
 				err = secinfo_parse(&mesg, buf, &exp);
+			else if (strcmp(buf, "xprtsec") == 0)
+				err = xprtsec_parse(&mesg, buf, &exp);
 			else
 				/* quietly ignore unknown words and anything
 				 * following. Newer user-space can try to set
@@ -663,6 +687,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 		err = check_export(&exp.ex_path, &exp.ex_flags, exp.ex_uuid);
 		if (err)
 			goto out4;
+
 		/*
 		 * No point caching this if it would immediately expire.
 		 * Also, this protects exportfs's dummy export from the
@@ -824,6 +849,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
 	for (i = 0; i < MAX_SECINFO_LIST; i++) {
 		new->ex_flavors[i] = item->ex_flavors[i];
 	}
+	new->ex_xprtsec_modes = item->ex_xprtsec_modes;
 }
 
 static struct cache_head *svc_export_alloc(void)
@@ -1035,9 +1061,26 @@ static struct svc_export *exp_find(struct cache_detail *cd,
 
 __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
 {
-	struct exp_flavor_info *f;
-	struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
+	struct exp_flavor_info *f, *end = exp->ex_flavors + exp->ex_nflavors;
+	struct svc_xprt *xprt = rqstp->rq_xprt;
+
+	if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_NONE) {
+		if (!test_bit(XPT_TLS_SESSION, &xprt->xpt_flags))
+			goto ok;
+	}
+	if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_TLS) {
+		if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) &&
+		    !test_bit(XPT_PEER_AUTH, &xprt->xpt_flags))
+			goto ok;
+	}
+	if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_MTLS) {
+		if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) &&
+		    test_bit(XPT_PEER_AUTH, &xprt->xpt_flags))
+			goto ok;
+	}
+	goto denied;
 
+ok:
 	/* legacy gss-only clients are always OK: */
 	if (exp->ex_client == rqstp->rq_gssclient)
 		return 0;
@@ -1062,6 +1105,7 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
 	if (nfsd4_spo_must_allow(rqstp))
 		return 0;
 
+denied:
 	return rqstp->rq_vers < 4 ? nfserr_acces : nfserr_wrongsec;
 }
 
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index d03f7f6a8642..2df8ae25aad3 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -77,6 +77,7 @@ struct svc_export {
 	struct cache_detail	*cd;
 	struct rcu_head		ex_rcu;
 	struct export_stats	ex_stats;
+	unsigned long		ex_xprtsec_modes;
 };
 
 /* an "export key" (expkey) maps a filehandlefragement to an
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 6e8712bd7c99..ee9c923192e0 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -74,70 +74,9 @@ static struct list_lru			nfsd_file_lru;
 static unsigned long			nfsd_file_flags;
 static struct fsnotify_group		*nfsd_file_fsnotify_group;
 static struct delayed_work		nfsd_filecache_laundrette;
-static struct rhashtable		nfsd_file_rhash_tbl
+static struct rhltable			nfsd_file_rhltable
 						____cacheline_aligned_in_smp;
 
-enum nfsd_file_lookup_type {
-	NFSD_FILE_KEY_INODE,
-	NFSD_FILE_KEY_FULL,
-};
-
-struct nfsd_file_lookup_key {
-	struct inode			*inode;
-	struct net			*net;
-	const struct cred		*cred;
-	unsigned char			need;
-	bool				gc;
-	enum nfsd_file_lookup_type	type;
-};
-
-/*
- * The returned hash value is based solely on the address of an in-code
- * inode, a pointer to a slab-allocated object. The entropy in such a
- * pointer is concentrated in its middle bits.
- */
-static u32 nfsd_file_inode_hash(const struct inode *inode, u32 seed)
-{
-	unsigned long ptr = (unsigned long)inode;
-	u32 k;
-
-	k = ptr >> L1_CACHE_SHIFT;
-	k &= 0x00ffffff;
-	return jhash2(&k, 1, seed);
-}
-
-/**
- * nfsd_file_key_hashfn - Compute the hash value of a lookup key
- * @data: key on which to compute the hash value
- * @len: rhash table's key_len parameter (unused)
- * @seed: rhash table's random seed of the day
- *
- * Return value:
- *   Computed 32-bit hash value
- */
-static u32 nfsd_file_key_hashfn(const void *data, u32 len, u32 seed)
-{
-	const struct nfsd_file_lookup_key *key = data;
-
-	return nfsd_file_inode_hash(key->inode, seed);
-}
-
-/**
- * nfsd_file_obj_hashfn - Compute the hash value of an nfsd_file
- * @data: object on which to compute the hash value
- * @len: rhash table's key_len parameter (unused)
- * @seed: rhash table's random seed of the day
- *
- * Return value:
- *   Computed 32-bit hash value
- */
-static u32 nfsd_file_obj_hashfn(const void *data, u32 len, u32 seed)
-{
-	const struct nfsd_file *nf = data;
-
-	return nfsd_file_inode_hash(nf->nf_inode, seed);
-}
-
 static bool
 nfsd_match_cred(const struct cred *c1, const struct cred *c2)
 {
@@ -158,53 +97,16 @@ nfsd_match_cred(const struct cred *c1, const struct cred *c2)
 	return true;
 }
 
-/**
- * nfsd_file_obj_cmpfn - Match a cache item against search criteria
- * @arg: search criteria
- * @ptr: cache item to check
- *
- * Return values:
- *   %0 - Item matches search criteria
- *   %1 - Item does not match search criteria
- */
-static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg,
-			       const void *ptr)
-{
-	const struct nfsd_file_lookup_key *key = arg->key;
-	const struct nfsd_file *nf = ptr;
-
-	switch (key->type) {
-	case NFSD_FILE_KEY_INODE:
-		if (nf->nf_inode != key->inode)
-			return 1;
-		break;
-	case NFSD_FILE_KEY_FULL:
-		if (nf->nf_inode != key->inode)
-			return 1;
-		if (nf->nf_may != key->need)
-			return 1;
-		if (nf->nf_net != key->net)
-			return 1;
-		if (!nfsd_match_cred(nf->nf_cred, key->cred))
-			return 1;
-		if (!!test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc)
-			return 1;
-		if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0)
-			return 1;
-		break;
-	}
-	return 0;
-}
-
 static const struct rhashtable_params nfsd_file_rhash_params = {
 	.key_len		= sizeof_field(struct nfsd_file, nf_inode),
 	.key_offset		= offsetof(struct nfsd_file, nf_inode),
-	.head_offset		= offsetof(struct nfsd_file, nf_rhash),
-	.hashfn			= nfsd_file_key_hashfn,
-	.obj_hashfn		= nfsd_file_obj_hashfn,
-	.obj_cmpfn		= nfsd_file_obj_cmpfn,
-	/* Reduce resizing churn on light workloads */
-	.min_size		= 512,		/* buckets */
+	.head_offset		= offsetof(struct nfsd_file, nf_rlist),
+
+	/*
+	 * Start with a single page hash table to reduce resizing churn
+	 * on light workloads.
+	 */
+	.min_size		= 256,
 	.automatic_shrinking	= true,
 };
 
@@ -307,27 +209,27 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf, struct inode *inode)
 }
 
 static struct nfsd_file *
-nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
+nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need,
+		bool want_gc)
 {
 	struct nfsd_file *nf;
 
 	nf = kmem_cache_alloc(nfsd_file_slab, GFP_KERNEL);
-	if (nf) {
-		INIT_LIST_HEAD(&nf->nf_lru);
-		nf->nf_birthtime = ktime_get();
-		nf->nf_file = NULL;
-		nf->nf_cred = get_current_cred();
-		nf->nf_net = key->net;
-		nf->nf_flags = 0;
-		__set_bit(NFSD_FILE_HASHED, &nf->nf_flags);
-		__set_bit(NFSD_FILE_PENDING, &nf->nf_flags);
-		if (key->gc)
-			__set_bit(NFSD_FILE_GC, &nf->nf_flags);
-		nf->nf_inode = key->inode;
-		refcount_set(&nf->nf_ref, 1);
-		nf->nf_may = key->need;
-		nf->nf_mark = NULL;
-	}
+	if (unlikely(!nf))
+		return NULL;
+
+	INIT_LIST_HEAD(&nf->nf_lru);
+	nf->nf_birthtime = ktime_get();
+	nf->nf_file = NULL;
+	nf->nf_cred = get_current_cred();
+	nf->nf_net = net;
+	nf->nf_flags = want_gc ?
+		BIT(NFSD_FILE_HASHED) | BIT(NFSD_FILE_PENDING) | BIT(NFSD_FILE_GC) :
+		BIT(NFSD_FILE_HASHED) | BIT(NFSD_FILE_PENDING);
+	nf->nf_inode = inode;
+	refcount_set(&nf->nf_ref, 1);
+	nf->nf_may = need;
+	nf->nf_mark = NULL;
 	return nf;
 }
 
@@ -352,8 +254,8 @@ static void
 nfsd_file_hash_remove(struct nfsd_file *nf)
 {
 	trace_nfsd_file_unhash(nf);
-	rhashtable_remove_fast(&nfsd_file_rhash_tbl, &nf->nf_rhash,
-			       nfsd_file_rhash_params);
+	rhltable_remove(&nfsd_file_rhltable, &nf->nf_rlist,
+			nfsd_file_rhash_params);
 }
 
 static bool
@@ -380,10 +282,8 @@ nfsd_file_free(struct nfsd_file *nf)
 	if (nf->nf_mark)
 		nfsd_file_mark_put(nf->nf_mark);
 	if (nf->nf_file) {
-		get_file(nf->nf_file);
-		filp_close(nf->nf_file, NULL);
 		nfsd_file_check_write_error(nf);
-		fput(nf->nf_file);
+		filp_close(nf->nf_file, NULL);
 	}
 
 	/*
@@ -402,13 +302,23 @@ nfsd_file_check_writeback(struct nfsd_file *nf)
 	struct file *file = nf->nf_file;
 	struct address_space *mapping;
 
-	if (!file || !(file->f_mode & FMODE_WRITE))
+	/* File not open for write? */
+	if (!(file->f_mode & FMODE_WRITE))
+		return false;
+
+	/*
+	 * Some filesystems (e.g. NFS) flush all dirty data on close.
+	 * On others, there is no need to wait for writeback.
+	 */
+	if (!(file_inode(file)->i_sb->s_export_op->flags & EXPORT_OP_FLUSH_ON_CLOSE))
 		return false;
+
 	mapping = file->f_mapping;
 	return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) ||
 		mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
 }
 
+
 static bool nfsd_file_lru_add(struct nfsd_file *nf)
 {
 	set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
@@ -492,49 +402,26 @@ nfsd_file_dispose_list(struct list_head *dispose)
 	}
 }
 
-static void
-nfsd_file_list_remove_disposal(struct list_head *dst,
-		struct nfsd_fcache_disposal *l)
-{
-	spin_lock(&l->lock);
-	list_splice_init(&l->freeme, dst);
-	spin_unlock(&l->lock);
-}
-
-static void
-nfsd_file_list_add_disposal(struct list_head *files, struct net *net)
-{
-	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
-	struct nfsd_fcache_disposal *l = nn->fcache_disposal;
-
-	spin_lock(&l->lock);
-	list_splice_tail_init(files, &l->freeme);
-	spin_unlock(&l->lock);
-	queue_work(nfsd_filecache_wq, &l->work);
-}
-
-static void
-nfsd_file_list_add_pernet(struct list_head *dst, struct list_head *src,
-		struct net *net)
-{
-	struct nfsd_file *nf, *tmp;
-
-	list_for_each_entry_safe(nf, tmp, src, nf_lru) {
-		if (nf->nf_net == net)
-			list_move_tail(&nf->nf_lru, dst);
-	}
-}
-
+/**
+ * nfsd_file_dispose_list_delayed - move list of dead files to net's freeme list
+ * @dispose: list of nfsd_files to be disposed
+ *
+ * Transfers each file to the "freeme" list for its nfsd_net, to eventually
+ * be disposed of by the per-net garbage collector.
+ */
 static void
 nfsd_file_dispose_list_delayed(struct list_head *dispose)
 {
-	LIST_HEAD(list);
-	struct nfsd_file *nf;
-
 	while(!list_empty(dispose)) {
-		nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
-		nfsd_file_list_add_pernet(&list, dispose, nf->nf_net);
-		nfsd_file_list_add_disposal(&list, nf->nf_net);
+		struct nfsd_file *nf = list_first_entry(dispose,
+						struct nfsd_file, nf_lru);
+		struct nfsd_net *nn = net_generic(nf->nf_net, nfsd_net_id);
+		struct nfsd_fcache_disposal *l = nn->fcache_disposal;
+
+		spin_lock(&l->lock);
+		list_move_tail(&nf->nf_lru, &l->freeme);
+		spin_unlock(&l->lock);
+		queue_work(nfsd_filecache_wq, &l->work);
 	}
 }
 
@@ -678,8 +565,8 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct list_head *dispose)
  * @inode:   inode on which to close out nfsd_files
  * @dispose: list on which to gather nfsd_files to close out
  *
- * An nfsd_file represents a struct file being held open on behalf of nfsd. An
- * open file however can block other activity (such as leases), or cause
+ * An nfsd_file represents a struct file being held open on behalf of nfsd.
+ * An open file however can block other activity (such as leases), or cause
  * undesirable behavior (e.g. spurious silly-renames when reexporting NFS).
  *
  * This function is intended to find open nfsd_files when this sort of
@@ -692,20 +579,17 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct list_head *dispose)
 static void
 nfsd_file_queue_for_close(struct inode *inode, struct list_head *dispose)
 {
-	struct nfsd_file_lookup_key key = {
-		.type	= NFSD_FILE_KEY_INODE,
-		.inode	= inode,
-	};
+	struct rhlist_head *tmp, *list;
 	struct nfsd_file *nf;
 
 	rcu_read_lock();
-	do {
-		nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key,
-				       nfsd_file_rhash_params);
-		if (!nf)
-			break;
+	list = rhltable_lookup(&nfsd_file_rhltable, &inode,
+			       nfsd_file_rhash_params);
+	rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist) {
+		if (!test_bit(NFSD_FILE_GC, &nf->nf_flags))
+			continue;
 		nfsd_file_cond_queue(nf, dispose);
-	} while (1);
+	}
 	rcu_read_unlock();
 }
 
@@ -758,8 +642,8 @@ nfsd_file_close_inode_sync(struct inode *inode)
  * nfsd_file_delayed_close - close unused nfsd_files
  * @work: dummy
  *
- * Walk the LRU list and destroy any entries that have not been used since
- * the last scan.
+ * Scrape the freeme list for this nfsd_net, and then dispose of them
+ * all.
  */
 static void
 nfsd_file_delayed_close(struct work_struct *work)
@@ -768,7 +652,10 @@ nfsd_file_delayed_close(struct work_struct *work)
 	struct nfsd_fcache_disposal *l = container_of(work,
 			struct nfsd_fcache_disposal, work);
 
-	nfsd_file_list_remove_disposal(&head, l);
+	spin_lock(&l->lock);
+	list_splice_init(&l->freeme, &head);
+	spin_unlock(&l->lock);
+
 	nfsd_file_dispose_list(&head);
 }
 
@@ -829,7 +716,7 @@ nfsd_file_cache_init(void)
 	if (test_and_set_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1)
 		return 0;
 
-	ret = rhashtable_init(&nfsd_file_rhash_tbl, &nfsd_file_rhash_params);
+	ret = rhltable_init(&nfsd_file_rhltable, &nfsd_file_rhash_params);
 	if (ret)
 		return ret;
 
@@ -897,7 +784,7 @@ out_err:
 	nfsd_file_mark_slab = NULL;
 	destroy_workqueue(nfsd_filecache_wq);
 	nfsd_filecache_wq = NULL;
-	rhashtable_destroy(&nfsd_file_rhash_tbl);
+	rhltable_destroy(&nfsd_file_rhltable);
 	goto out;
 }
 
@@ -906,7 +793,8 @@ out_err:
  * @net: net-namespace to shut down the cache (may be NULL)
  *
  * Walk the nfsd_file cache and close out any that match @net. If @net is NULL,
- * then close out everything. Called when an nfsd instance is being shut down.
+ * then close out everything. Called when an nfsd instance is being shut down,
+ * and when the exports table is flushed.
  */
 static void
 __nfsd_file_cache_purge(struct net *net)
@@ -915,7 +803,7 @@ __nfsd_file_cache_purge(struct net *net)
 	struct nfsd_file *nf;
 	LIST_HEAD(dispose);
 
-	rhashtable_walk_enter(&nfsd_file_rhash_tbl, &iter);
+	rhltable_walk_enter(&nfsd_file_rhltable, &iter);
 	do {
 		rhashtable_walk_start(&iter);
 
@@ -1021,7 +909,7 @@ nfsd_file_cache_shutdown(void)
 	nfsd_file_mark_slab = NULL;
 	destroy_workqueue(nfsd_filecache_wq);
 	nfsd_filecache_wq = NULL;
-	rhashtable_destroy(&nfsd_file_rhash_tbl);
+	rhltable_destroy(&nfsd_file_rhltable);
 
 	for_each_possible_cpu(i) {
 		per_cpu(nfsd_file_cache_hits, i) = 0;
@@ -1032,6 +920,35 @@ nfsd_file_cache_shutdown(void)
 	}
 }
 
+static struct nfsd_file *
+nfsd_file_lookup_locked(const struct net *net, const struct cred *cred,
+			struct inode *inode, unsigned char need,
+			bool want_gc)
+{
+	struct rhlist_head *tmp, *list;
+	struct nfsd_file *nf;
+
+	list = rhltable_lookup(&nfsd_file_rhltable, &inode,
+			       nfsd_file_rhash_params);
+	rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist) {
+		if (nf->nf_may != need)
+			continue;
+		if (nf->nf_net != net)
+			continue;
+		if (!nfsd_match_cred(nf->nf_cred, cred))
+			continue;
+		if (test_bit(NFSD_FILE_GC, &nf->nf_flags) != want_gc)
+			continue;
+		if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0)
+			continue;
+
+		if (!nfsd_file_get(nf))
+			continue;
+		return nf;
+	}
+	return NULL;
+}
+
 /**
  * nfsd_file_is_cached - are there any cached open files for this inode?
  * @inode: inode to check
@@ -1046,15 +963,20 @@ nfsd_file_cache_shutdown(void)
 bool
 nfsd_file_is_cached(struct inode *inode)
 {
-	struct nfsd_file_lookup_key key = {
-		.type	= NFSD_FILE_KEY_INODE,
-		.inode	= inode,
-	};
+	struct rhlist_head *tmp, *list;
+	struct nfsd_file *nf;
 	bool ret = false;
 
-	if (rhashtable_lookup_fast(&nfsd_file_rhash_tbl, &key,
-				   nfsd_file_rhash_params) != NULL)
-		ret = true;
+	rcu_read_lock();
+	list = rhltable_lookup(&nfsd_file_rhltable, &inode,
+			       nfsd_file_rhash_params);
+	rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist)
+		if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
+			ret = true;
+			break;
+		}
+	rcu_read_unlock();
+
 	trace_nfsd_file_is_cached(inode, (int)ret);
 	return ret;
 }
@@ -1064,14 +986,12 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		     unsigned int may_flags, struct file *file,
 		     struct nfsd_file **pnf, bool want_gc)
 {
-	struct nfsd_file_lookup_key key = {
-		.type	= NFSD_FILE_KEY_FULL,
-		.need	= may_flags & NFSD_FILE_MAY_MASK,
-		.net	= SVC_NET(rqstp),
-		.gc	= want_gc,
-	};
+	unsigned char need = may_flags & NFSD_FILE_MAY_MASK;
+	struct net *net = SVC_NET(rqstp);
+	struct nfsd_file *new, *nf;
+	const struct cred *cred;
 	bool open_retry = true;
-	struct nfsd_file *nf;
+	struct inode *inode;
 	__be32 status;
 	int ret;
 
@@ -1079,80 +999,88 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
 				may_flags|NFSD_MAY_OWNER_OVERRIDE);
 	if (status != nfs_ok)
 		return status;
-	key.inode = d_inode(fhp->fh_dentry);
-	key.cred = get_current_cred();
+	inode = d_inode(fhp->fh_dentry);
+	cred = get_current_cred();
 
 retry:
 	rcu_read_lock();
-	nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key,
-			       nfsd_file_rhash_params);
-	nf = nfsd_file_get(nf);
+	nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc);
 	rcu_read_unlock();
 
 	if (nf) {
+		/*
+		 * If the nf is on the LRU then it holds an extra reference
+		 * that must be put if it's removed. It had better not be
+		 * the last one however, since we should hold another.
+		 */
 		if (nfsd_file_lru_remove(nf))
 			WARN_ON_ONCE(refcount_dec_and_test(&nf->nf_ref));
 		goto wait_for_construction;
 	}
 
-	nf = nfsd_file_alloc(&key, may_flags);
-	if (!nf) {
+	new = nfsd_file_alloc(net, inode, need, want_gc);
+	if (!new) {
 		status = nfserr_jukebox;
-		goto out_status;
+		goto out;
 	}
 
-	ret = rhashtable_lookup_insert_key(&nfsd_file_rhash_tbl,
-					   &key, &nf->nf_rhash,
-					   nfsd_file_rhash_params);
+	rcu_read_lock();
+	spin_lock(&inode->i_lock);
+	nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc);
+	if (unlikely(nf)) {
+		spin_unlock(&inode->i_lock);
+		rcu_read_unlock();
+		nfsd_file_slab_free(&new->nf_rcu);
+		goto wait_for_construction;
+	}
+	nf = new;
+	ret = rhltable_insert(&nfsd_file_rhltable, &nf->nf_rlist,
+			      nfsd_file_rhash_params);
+	spin_unlock(&inode->i_lock);
+	rcu_read_unlock();
 	if (likely(ret == 0))
 		goto open_file;
 
-	nfsd_file_slab_free(&nf->nf_rcu);
-	nf = NULL;
 	if (ret == -EEXIST)
 		goto retry;
-	trace_nfsd_file_insert_err(rqstp, key.inode, may_flags, ret);
+	trace_nfsd_file_insert_err(rqstp, inode, may_flags, ret);
 	status = nfserr_jukebox;
-	goto out_status;
+	goto construction_err;
 
 wait_for_construction:
 	wait_on_bit(&nf->nf_flags, NFSD_FILE_PENDING, TASK_UNINTERRUPTIBLE);
 
 	/* Did construction of this file fail? */
 	if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
-		trace_nfsd_file_cons_err(rqstp, key.inode, may_flags, nf);
+		trace_nfsd_file_cons_err(rqstp, inode, may_flags, nf);
 		if (!open_retry) {
 			status = nfserr_jukebox;
-			goto out;
+			goto construction_err;
 		}
 		open_retry = false;
-		if (refcount_dec_and_test(&nf->nf_ref))
-			nfsd_file_free(nf);
 		goto retry;
 	}
-
 	this_cpu_inc(nfsd_file_cache_hits);
 
 	status = nfserrno(nfsd_open_break_lease(file_inode(nf->nf_file), may_flags));
+	if (status != nfs_ok) {
+		nfsd_file_put(nf);
+		nf = NULL;
+	}
+
 out:
 	if (status == nfs_ok) {
 		this_cpu_inc(nfsd_file_acquisitions);
 		nfsd_file_check_write_error(nf);
 		*pnf = nf;
-	} else {
-		if (refcount_dec_and_test(&nf->nf_ref))
-			nfsd_file_free(nf);
-		nf = NULL;
 	}
-
-out_status:
-	put_cred(key.cred);
-	trace_nfsd_file_acquire(rqstp, key.inode, may_flags, nf, status);
+	put_cred(cred);
+	trace_nfsd_file_acquire(rqstp, inode, may_flags, nf, status);
 	return status;
 
 open_file:
 	trace_nfsd_file_alloc(nf);
-	nf->nf_mark = nfsd_file_mark_find_or_create(nf, key.inode);
+	nf->nf_mark = nfsd_file_mark_find_or_create(nf, inode);
 	if (nf->nf_mark) {
 		if (file) {
 			get_file(file);
@@ -1170,13 +1098,16 @@ open_file:
 	 * If construction failed, or we raced with a call to unlink()
 	 * then unhash.
 	 */
-	if (status == nfs_ok && key.inode->i_nlink == 0)
-		status = nfserr_jukebox;
-	if (status != nfs_ok)
+	if (status != nfs_ok || inode->i_nlink == 0)
 		nfsd_file_unhash(nf);
-	clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
-	smp_mb__after_atomic();
-	wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
+	clear_and_wake_up_bit(NFSD_FILE_PENDING, &nf->nf_flags);
+	if (status == nfs_ok)
+		goto out;
+
+construction_err:
+	if (refcount_dec_and_test(&nf->nf_ref))
+		nfsd_file_free(nf);
+	nf = NULL;
 	goto out;
 }
 
@@ -1192,8 +1123,11 @@ open_file:
  * seconds after the final nfsd_file_put() in case the caller
  * wants to re-use it.
  *
- * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
- * network byte order is returned.
+ * Return values:
+ *   %nfs_ok - @pnf points to an nfsd_file with its reference
+ *   count boosted.
+ *
+ * On error, an nfsstat value in network byte order is returned.
  */
 __be32
 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
@@ -1213,8 +1147,11 @@ nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
  * but not garbage-collected. The object is unhashed after the
  * final nfsd_file_put().
  *
- * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
- * network byte order is returned.
+ * Return values:
+ *   %nfs_ok - @pnf points to an nfsd_file with its reference
+ *   count boosted.
+ *
+ * On error, an nfsstat value in network byte order is returned.
  */
 __be32
 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
@@ -1235,8 +1172,11 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
  * and @file is non-NULL, use it to instantiate a new nfsd_file instead of
  * opening a new one.
  *
- * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
- * network byte order is returned.
+ * Return values:
+ *   %nfs_ok - @pnf points to an nfsd_file with its reference
+ *   count boosted.
+ *
+ * On error, an nfsstat value in network byte order is returned.
  */
 __be32
 nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp,
@@ -1267,7 +1207,7 @@ int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
 		lru = list_lru_count(&nfsd_file_lru);
 
 		rcu_read_lock();
-		ht = &nfsd_file_rhash_tbl;
+		ht = &nfsd_file_rhltable.ht;
 		count = atomic_read(&ht->nelems);
 		tbl = rht_dereference_rcu(ht->tbl, ht);
 		buckets = tbl->size;
@@ -1283,7 +1223,7 @@ int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
 		evictions += per_cpu(nfsd_file_evictions, i);
 	}
 
-	seq_printf(m, "total entries: %u\n", count);
+	seq_printf(m, "total inodes:  %u\n", count);
 	seq_printf(m, "hash buckets:  %u\n", buckets);
 	seq_printf(m, "lru entries:   %lu\n", lru);
 	seq_printf(m, "cache hits:    %lu\n", hits);
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index 41516a4263ea..e54165a3224f 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -29,9 +29,8 @@ struct nfsd_file_mark {
  * never be dereferenced, only used for comparison.
  */
 struct nfsd_file {
-	struct rhash_head	nf_rhash;
-	struct list_head	nf_lru;
-	struct rcu_head		nf_rcu;
+	struct rhlist_head	nf_rlist;
+	void			*nf_inode;
 	struct file		*nf_file;
 	const struct cred	*nf_cred;
 	struct net		*nf_net;
@@ -40,10 +39,12 @@ struct nfsd_file {
 #define NFSD_FILE_REFERENCED	(2)
 #define NFSD_FILE_GC		(3)
 	unsigned long		nf_flags;
-	struct inode		*nf_inode;	/* don't deref */
 	refcount_t		nf_ref;
 	unsigned char		nf_may;
+
 	struct nfsd_file_mark	*nf_mark;
+	struct list_head	nf_lru;
+	struct rcu_head		nf_rcu;
 	ktime_t			nf_birthtime;
 };
 
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 5e9809aff37e..7a806ac13e31 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -240,8 +240,8 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
 		goto out;
 
 	/* expiry */
-	ent.h.expiry_time = get_expiry(&buf);
-	if (ent.h.expiry_time == 0)
+	error = get_expiry(&buf, &ent.h.expiry_time);
+	if (error)
 		goto out;
 
 	error = -ENOMEM;
@@ -408,8 +408,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
 	memcpy(ent.name, buf1, sizeof(ent.name));
 
 	/* expiry */
-	ent.h.expiry_time = get_expiry(&buf);
-	if (ent.h.expiry_time == 0)
+	error = get_expiry(&buf, &ent.h.expiry_time);
+	if (error)
 		goto out;
 
 	/* ID */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 5783209f17fc..bb9d47172162 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -930,6 +930,9 @@ nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags,
  * Grab and keep cached pages associated with a file in the svc_rqst
  * so that they can be passed to the network sendmsg/sendpage routines
  * directly. They will be released after the sending has completed.
+ *
+ * Return values: Number of bytes consumed, or -EIO if there are no
+ * remaining pages in rqstp->rq_pages.
  */
 static int
 nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
@@ -948,7 +951,8 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 		 */
 		if (page == *(rqstp->rq_next_page - 1))
 			continue;
-		svc_rqst_replace_page(rqstp, page);
+		if (unlikely(!svc_rqst_replace_page(rqstp, page)))
+			return -EIO;
 	}
 	if (rqstp->rq_res.page_len == 0)	// first call
 		rqstp->rq_res.page_base = offset % PAGE_SIZE;
@@ -2164,7 +2168,7 @@ nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name,
 		goto out;
 	}
 
-	buf = kvmalloc(len, GFP_KERNEL | GFP_NOFS);
+	buf = kvmalloc(len, GFP_KERNEL);
 	if (buf == NULL) {
 		err = nfserr_jukebox;
 		goto out;
@@ -2227,10 +2231,7 @@ nfsd_listxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char **bufp,
 		goto out;
 	}
 
-	/*
-	 * We're holding i_rwsem - use GFP_NOFS.
-	 */
-	buf = kvmalloc(len, GFP_KERNEL | GFP_NOFS);
+	buf = kvmalloc(len, GFP_KERNEL);
 	if (buf == NULL) {
 		err = nfserr_jukebox;
 		goto out;
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 41ccd43cd979..5cf30827f244 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -259,10 +259,10 @@ repeat:
 			NILFS_PAGE_BUG(&folio->page, "inconsistent dirty state");
 
 		dfolio = filemap_grab_folio(dmap, folio->index);
-		if (unlikely(!dfolio)) {
+		if (unlikely(IS_ERR(dfolio))) {
 			/* No empty page is added to the page cache */
-			err = -ENOMEM;
 			folio_unlock(folio);
+			err = PTR_ERR(dfolio);
 			break;
 		}
 		if (unlikely(!folio_buffers(folio)))
@@ -311,7 +311,7 @@ repeat:
 
 		folio_lock(folio);
 		dfolio = filemap_lock_folio(dmap, index);
-		if (dfolio) {
+		if (!IS_ERR(dfolio)) {
 			/* overwrite existing folio in the destination cache */
 			WARN_ON(folio_test_dirty(dfolio));
 			nilfs_copy_page(&dfolio->page, &folio->page, 0);
diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c
index a030d00af90c..174fe536a1c0 100644
--- a/fs/ntfs/sysctl.c
+++ b/fs/ntfs/sysctl.c
@@ -31,16 +31,6 @@ static struct ctl_table ntfs_sysctls[] = {
 	{}
 };
 
-/* Define the parent directory /proc/sys/fs. */
-static struct ctl_table sysctls_root[] = {
-	{
-		.procname	= "fs",
-		.mode		= 0555,
-		.child		= ntfs_sysctls
-	},
-	{}
-};
-
 /* Storage for the sysctls header. */
 static struct ctl_table_header *sysctls_root_table;
 
@@ -54,7 +44,7 @@ int ntfs_sysctl(int add)
 {
 	if (add) {
 		BUG_ON(sysctls_root_table);
-		sysctls_root_table = register_sysctl_table(sysctls_root);
+		sysctls_root_table = register_sysctl("fs", ntfs_sysctls);
 		if (!sysctls_root_table)
 			return -ENOMEM;
 	} else {
diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c
index 5e6bafb10f42..0b8bc66377db 100644
--- a/fs/ntfs3/attrib.c
+++ b/fs/ntfs3/attrib.c
@@ -405,8 +405,8 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type,
 	int err = 0;
 	struct ntfs_sb_info *sbi = ni->mi.sbi;
 	u8 cluster_bits = sbi->cluster_bits;
-	bool is_mft =
-		ni->mi.rno == MFT_REC_MFT && type == ATTR_DATA && !name_len;
+	bool is_mft = ni->mi.rno == MFT_REC_MFT && type == ATTR_DATA &&
+		      !name_len;
 	u64 old_valid, old_size, old_alloc, new_alloc, new_alloc_tmp;
 	struct ATTRIB *attr = NULL, *attr_b;
 	struct ATTR_LIST_ENTRY *le, *le_b;
@@ -531,11 +531,10 @@ add_alloc_in_same_attr_seg:
 			pre_alloc = 0;
 			if (type == ATTR_DATA && !name_len &&
 			    sbi->options->prealloc) {
-				pre_alloc =
-					bytes_to_cluster(
-						sbi,
-						get_pre_allocated(new_size)) -
-					new_alen;
+				pre_alloc = bytes_to_cluster(
+						    sbi, get_pre_allocated(
+								 new_size)) -
+					    new_alen;
 			}
 
 			/* Get the last LCN to allocate from. */
@@ -573,8 +572,8 @@ add_alloc_in_same_attr_seg:
 			err = attr_allocate_clusters(
 				sbi, run, vcn, lcn, to_allocate, &pre_alloc,
 				is_mft ? ALLOCATE_MFT : ALLOCATE_DEF, &alen,
-				is_mft ? 0
-				       : (sbi->record_size -
+				is_mft ? 0 :
+					       (sbi->record_size -
 					  le32_to_cpu(rec->used) + 8) /
 							 3 +
 						 1,
diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c
index 723fb64e6531..9a6c6a09d70c 100644
--- a/fs/ntfs3/bitmap.c
+++ b/fs/ntfs3/bitmap.c
@@ -40,9 +40,9 @@ static struct kmem_cache *ntfs_enode_cachep;
 
 int __init ntfs3_init_bitmap(void)
 {
-	ntfs_enode_cachep =
-		kmem_cache_create("ntfs3_enode_cache", sizeof(struct e_node), 0,
-				  SLAB_RECLAIM_ACCOUNT, NULL);
+	ntfs_enode_cachep = kmem_cache_create("ntfs3_enode_cache",
+					      sizeof(struct e_node), 0,
+					      SLAB_RECLAIM_ACCOUNT, NULL);
 	return ntfs_enode_cachep ? 0 : -ENOMEM;
 }
 
@@ -286,9 +286,9 @@ static void wnd_add_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len,
 		if (wnd->uptodated != 1) {
 			/* Check bits before 'bit'. */
 			ib = wnd->zone_bit == wnd->zone_end ||
-					     bit < wnd->zone_end
-				     ? 0
-				     : wnd->zone_end;
+					     bit < wnd->zone_end ?
+					   0 :
+					   wnd->zone_end;
 
 			while (bit > ib && wnd_is_free_hlp(wnd, bit - 1, 1)) {
 				bit -= 1;
@@ -297,9 +297,9 @@ static void wnd_add_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len,
 
 			/* Check bits after 'end_in'. */
 			ib = wnd->zone_bit == wnd->zone_end ||
-					     end_in > wnd->zone_bit
-				     ? wnd->nbits
-				     : wnd->zone_bit;
+					     end_in > wnd->zone_bit ?
+					   wnd->nbits :
+					   wnd->zone_bit;
 
 			while (end_in < ib && wnd_is_free_hlp(wnd, end_in, 1)) {
 				end_in += 1;
@@ -417,8 +417,8 @@ static void wnd_remove_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len)
 			return;
 		n3 = rb_first(&wnd->count_tree);
 		wnd->extent_max =
-			n3 ? rb_entry(n3, struct e_node, count.node)->count.key
-			   : 0;
+			n3 ? rb_entry(n3, struct e_node, count.node)->count.key :
+				   0;
 		return;
 	}
 
@@ -658,7 +658,8 @@ int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits)
 	if (!wnd->bits_last)
 		wnd->bits_last = wbits;
 
-	wnd->free_bits = kcalloc(wnd->nwnd, sizeof(u16), GFP_NOFS | __GFP_NOWARN);
+	wnd->free_bits =
+		kcalloc(wnd->nwnd, sizeof(u16), GFP_NOFS | __GFP_NOWARN);
 	if (!wnd->free_bits)
 		return -ENOMEM;
 
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index e9bdc1ff08c9..9a3d55c367d9 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -22,20 +22,21 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg)
 {
 	struct fstrim_range __user *user_range;
 	struct fstrim_range range;
+	struct block_device *dev;
 	int err;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (!bdev_max_discard_sectors(sbi->sb->s_bdev))
+	dev = sbi->sb->s_bdev;
+	if (!bdev_max_discard_sectors(dev))
 		return -EOPNOTSUPP;
 
 	user_range = (struct fstrim_range __user *)arg;
 	if (copy_from_user(&range, user_range, sizeof(range)))
 		return -EFAULT;
 
-	range.minlen = max_t(u32, range.minlen,
-			     bdev_discard_granularity(sbi->sb->s_bdev));
+	range.minlen = max_t(u32, range.minlen, bdev_discard_granularity(dev));
 
 	err = ntfs_trim_fs(sbi, &range);
 	if (err < 0)
@@ -190,8 +191,8 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
 
 	for (; idx < idx_end; idx += 1, from = 0) {
 		page_off = (loff_t)idx << PAGE_SHIFT;
-		to = (page_off + PAGE_SIZE) > vbo_to ? (vbo_to - page_off)
-						     : PAGE_SIZE;
+		to = (page_off + PAGE_SIZE) > vbo_to ? (vbo_to - page_off) :
+							     PAGE_SIZE;
 		iblock = page_off >> inode->i_blkbits;
 
 		page = find_or_create_page(mapping, idx,
@@ -223,16 +224,10 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
 				set_buffer_uptodate(bh);
 
 			if (!buffer_uptodate(bh)) {
-				lock_buffer(bh);
-				bh->b_end_io = end_buffer_read_sync;
-				get_bh(bh);
-				submit_bh(REQ_OP_READ, bh);
-
-				wait_on_buffer(bh);
-				if (!buffer_uptodate(bh)) {
+				err = bh_read(bh, 0);
+				if (err < 0) {
 					unlock_page(page);
 					put_page(page);
-					err = -EIO;
 					goto out;
 				}
 			}
@@ -570,13 +565,14 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
 		ni_unlock(ni);
 	} else {
 		/* Check new size. */
+		u8 cluster_bits = sbi->cluster_bits;
 
 		/* generic/213: expected -ENOSPC instead of -EFBIG. */
 		if (!is_supported_holes) {
 			loff_t to_alloc = new_size - inode_get_bytes(inode);
 
 			if (to_alloc > 0 &&
-			    (to_alloc >> sbi->cluster_bits) >
+			    (to_alloc >> cluster_bits) >
 				    wnd_zeroes(&sbi->used.bitmap)) {
 				err = -ENOSPC;
 				goto out;
@@ -597,7 +593,7 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
 		}
 
 		if (is_supported_holes) {
-			CLST vcn = vbo >> sbi->cluster_bits;
+			CLST vcn = vbo >> cluster_bits;
 			CLST cend = bytes_to_cluster(sbi, end);
 			CLST cend_v = bytes_to_cluster(sbi, ni->i_valid);
 			CLST lcn, clen;
@@ -660,22 +656,12 @@ out:
 int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 		  struct iattr *attr)
 {
-	struct super_block *sb = dentry->d_sb;
-	struct ntfs_sb_info *sbi = sb->s_fs_info;
 	struct inode *inode = d_inode(dentry);
 	struct ntfs_inode *ni = ntfs_i(inode);
 	u32 ia_valid = attr->ia_valid;
 	umode_t mode = inode->i_mode;
 	int err;
 
-	if (sbi->options->noacsrules) {
-		/* "No access rules" - Force any changes of time etc. */
-		attr->ia_valid |= ATTR_FORCE;
-		/* and disable for editing some attributes. */
-		attr->ia_valid &= ~(ATTR_UID | ATTR_GID | ATTR_MODE);
-		ia_valid = attr->ia_valid;
-	}
-
 	err = setattr_prepare(idmap, dentry, attr);
 	if (err)
 		goto out;
@@ -719,7 +705,7 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 	}
 
 	if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE))
-		ntfs_save_wsl_perm(inode);
+		ntfs_save_wsl_perm(inode, NULL);
 	mark_inode_dirty(inode);
 out:
 	return err;
@@ -1065,8 +1051,8 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (ret)
 		goto out;
 
-	ret = is_compressed(ni) ? ntfs_compress_write(iocb, from)
-				: __generic_file_write_iter(iocb, from);
+	ret = is_compressed(ni) ? ntfs_compress_write(iocb, from) :
+					__generic_file_write_iter(iocb, from);
 
 out:
 	inode_unlock(inode);
@@ -1118,8 +1104,9 @@ static int ntfs_file_release(struct inode *inode, struct file *file)
 	int err = 0;
 
 	/* If we are last writer on the inode, drop the block reservation. */
-	if (sbi->options->prealloc && ((file->f_mode & FMODE_WRITE) &&
-				      atomic_read(&inode->i_writecount) == 1)) {
+	if (sbi->options->prealloc &&
+	    ((file->f_mode & FMODE_WRITE) &&
+	     atomic_read(&inode->i_writecount) == 1)) {
 		ni_lock(ni);
 		down_write(&ni->file.run_lock);
 
@@ -1159,8 +1146,7 @@ const struct inode_operations ntfs_file_inode_operations = {
 	.getattr	= ntfs_getattr,
 	.setattr	= ntfs3_setattr,
 	.listxattr	= ntfs_listxattr,
-	.permission	= ntfs_permission,
-	.get_inode_acl	= ntfs_get_acl,
+	.get_acl	= ntfs_get_acl,
 	.set_acl	= ntfs_set_acl,
 	.fiemap		= ntfs_fiemap,
 };
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index f1df52dfab74..2bfcf1a989c9 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -76,8 +76,8 @@ struct ATTR_STD_INFO *ni_std(struct ntfs_inode *ni)
 	const struct ATTRIB *attr;
 
 	attr = mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL);
-	return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO))
-		    : NULL;
+	return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO)) :
+			    NULL;
 }
 
 /*
@@ -91,8 +91,8 @@ struct ATTR_STD_INFO5 *ni_std5(struct ntfs_inode *ni)
 
 	attr = mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL);
 
-	return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO5))
-		    : NULL;
+	return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO5)) :
+			    NULL;
 }
 
 /*
@@ -102,7 +102,7 @@ void ni_clear(struct ntfs_inode *ni)
 {
 	struct rb_node *node;
 
-	if (!ni->vfs_inode.i_nlink && is_rec_inuse(ni->mi.mrec))
+	if (!ni->vfs_inode.i_nlink && ni->mi.mrec && is_rec_inuse(ni->mi.mrec))
 		ni_delete_all(ni);
 
 	al_destroy(ni);
@@ -1439,8 +1439,8 @@ int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type,
 	int err;
 	CLST plen;
 	struct ATTRIB *attr;
-	bool is_ext =
-		(flags & (ATTR_FLAG_SPARSED | ATTR_FLAG_COMPRESSED)) && !svcn;
+	bool is_ext = (flags & (ATTR_FLAG_SPARSED | ATTR_FLAG_COMPRESSED)) &&
+		      !svcn;
 	u32 name_size = ALIGN(name_len * sizeof(short), 8);
 	u32 name_off = is_ext ? SIZEOF_NONRESIDENT_EX : SIZEOF_NONRESIDENT;
 	u32 run_off = name_off + name_size;
@@ -1645,7 +1645,7 @@ struct ATTR_FILE_NAME *ni_fname_name(struct ntfs_inode *ni,
 {
 	struct ATTRIB *attr = NULL;
 	struct ATTR_FILE_NAME *fname;
-       struct le_str *fns;
+	struct le_str *fns;
 
 	if (le)
 		*le = NULL;
@@ -1756,9 +1756,9 @@ int ni_new_attr_flags(struct ntfs_inode *ni, enum FILE_ATTRIBUTE new_fa)
 	}
 
 	/* Resize nonresident empty attribute in-place only. */
-	new_asize = (new_aflags & (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED))
-			    ? (SIZEOF_NONRESIDENT_EX + 8)
-			    : (SIZEOF_NONRESIDENT + 8);
+	new_asize = (new_aflags & (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED)) ?
+				  (SIZEOF_NONRESIDENT_EX + 8) :
+				  (SIZEOF_NONRESIDENT + 8);
 
 	if (!mi_resize_attr(mi, attr, new_asize - le32_to_cpu(attr->size)))
 		return -EOPNOTSUPP;
@@ -2965,14 +2965,14 @@ bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
 {
 	struct ntfs_sb_info *sbi = ni->mi.sbi;
 	struct ATTRIB *attr;
-	u16 de_key_size = de2 ? le16_to_cpu(de2->key_size) : 0;
+	u16 de_key_size;
 
 	switch (undo_step) {
 	case 4:
+		de_key_size = le16_to_cpu(de2->key_size);
 		if (ni_insert_resident(ni, de_key_size, ATTR_NAME, NULL, 0,
-				       &attr, NULL, NULL)) {
+				       &attr, NULL, NULL))
 			return false;
-		}
 		memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de2 + 1, de_key_size);
 
 		mi_get_ref(&ni->mi, &de2->ref);
@@ -2981,19 +2981,16 @@ bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
 		de2->flags = 0;
 		de2->res = 0;
 
-		if (indx_insert_entry(&dir_ni->dir, dir_ni, de2, sbi, NULL,
-				      1)) {
+		if (indx_insert_entry(&dir_ni->dir, dir_ni, de2, sbi, NULL, 1))
 			return false;
-		}
 		fallthrough;
 
 	case 2:
 		de_key_size = le16_to_cpu(de->key_size);
 
 		if (ni_insert_resident(ni, de_key_size, ATTR_NAME, NULL, 0,
-				       &attr, NULL, NULL)) {
+				       &attr, NULL, NULL))
 			return false;
-		}
 
 		memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de + 1, de_key_size);
 		mi_get_ref(&ni->mi, &de->ref);
@@ -3162,9 +3159,9 @@ static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup,
 			u64 data_size = le64_to_cpu(attr->nres.data_size);
 			__le64 valid_le;
 
-			dup->alloc_size = is_attr_ext(attr)
-						  ? attr->nres.total_size
-						  : attr->nres.alloc_size;
+			dup->alloc_size = is_attr_ext(attr) ?
+							attr->nres.total_size :
+							attr->nres.alloc_size;
 			dup->data_size = attr->nres.data_size;
 
 			if (new_valid > data_size)
@@ -3258,6 +3255,9 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
 		return 0;
 	}
 
+	if (!ni->mi.mrec)
+		goto out;
+
 	if (is_rec_inuse(ni->mi.mrec) &&
 	    !(sbi->flags & NTFS_FLAGS_LOG_REPLAYING) && inode->i_nlink) {
 		bool modified = false;
@@ -3360,7 +3360,7 @@ out:
 	ni_unlock(ni);
 
 	if (err) {
-		ntfs_err(sb, "%s r=%lx failed, %d.", hint, inode->i_ino, err);
+		ntfs_inode_err(inode, "%s failed, %d.", hint, err);
 		ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
 		return err;
 	}
diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c
index c6eb371a3695..57762c5fe68b 100644
--- a/fs/ntfs3/fslog.c
+++ b/fs/ntfs3/fslog.c
@@ -827,10 +827,10 @@ static inline struct RESTART_TABLE *extend_rsttbl(struct RESTART_TABLE *tbl,
 
 	memcpy(rt + 1, tbl + 1, esize * used);
 
-	rt->free_goal = free_goal == ~0u
-				? cpu_to_le32(~0u)
-				: cpu_to_le32(sizeof(struct RESTART_TABLE) +
-					      free_goal * esize);
+	rt->free_goal = free_goal == ~0u ?
+				      cpu_to_le32(~0u) :
+				      cpu_to_le32(sizeof(struct RESTART_TABLE) +
+					    free_goal * esize);
 
 	if (tbl->first_free) {
 		rt->first_free = tbl->first_free;
@@ -1089,9 +1089,9 @@ static inline u64 base_lsn(struct ntfs_log *log,
 		    (lsn < (lsn_to_vbo(log, h_lsn) & ~log->page_mask) ? 1 : 0))
 		   << log->file_data_bits) +
 		  ((((is_log_record_end(hdr) &&
-		      h_lsn <= le64_to_cpu(hdr->record_hdr.last_end_lsn))
-			     ? le16_to_cpu(hdr->record_hdr.next_record_off)
-			     : log->page_size) +
+		      h_lsn <= le64_to_cpu(hdr->record_hdr.last_end_lsn)) ?
+				   le16_to_cpu(hdr->record_hdr.next_record_off) :
+				   log->page_size) +
 		    lsn) >>
 		   3);
 
@@ -1298,9 +1298,9 @@ static void log_init_pg_hdr(struct ntfs_log *log, u32 sys_page_size,
 	if (!log->clst_per_page)
 		log->clst_per_page = 1;
 
-	log->first_page = major_ver >= 2
-				  ? 0x22 * page_size
-				  : ((sys_page_size << 1) + (page_size << 1));
+	log->first_page = major_ver >= 2 ?
+					0x22 * page_size :
+					((sys_page_size << 1) + (page_size << 1));
 	log->major_ver = major_ver;
 	log->minor_ver = minor_ver;
 }
@@ -1512,20 +1512,19 @@ static u32 current_log_avail(struct ntfs_log *log)
 	 * have to compute the free range.
 	 * If there is no oldest lsn then start at the first page of the file.
 	 */
-	oldest_off = (log->l_flags & NTFSLOG_NO_OLDEST_LSN)
-			     ? log->first_page
-			     : (log->oldest_lsn_off & ~log->sys_page_mask);
+	oldest_off = (log->l_flags & NTFSLOG_NO_OLDEST_LSN) ?
+				   log->first_page :
+				   (log->oldest_lsn_off & ~log->sys_page_mask);
 
 	/*
 	 * We will use the next log page offset to compute the next free page.
 	 * If we are going to reuse this page go to the next page.
 	 * If we are at the first page then use the end of the file.
 	 */
-	next_free_off = (log->l_flags & NTFSLOG_REUSE_TAIL)
-				? log->next_page + log->page_size
-				: log->next_page == log->first_page
-					  ? log->l_size
-					  : log->next_page;
+	next_free_off = (log->l_flags & NTFSLOG_REUSE_TAIL) ?
+				      log->next_page + log->page_size :
+			log->next_page == log->first_page ? log->l_size :
+								  log->next_page;
 
 	/* If the two offsets are the same then there is no available space. */
 	if (oldest_off == next_free_off)
@@ -1535,9 +1534,9 @@ static u32 current_log_avail(struct ntfs_log *log)
 	 * this range from the total available pages.
 	 */
 	free_bytes =
-		oldest_off < next_free_off
-			? log->total_avail_pages - (next_free_off - oldest_off)
-			: oldest_off - next_free_off;
+		oldest_off < next_free_off ?
+			      log->total_avail_pages - (next_free_off - oldest_off) :
+			      oldest_off - next_free_off;
 
 	free_bytes >>= log->page_bits;
 	return free_bytes * log->reserved;
@@ -1671,8 +1670,8 @@ next_tail:
 	}
 
 	best_lsn1 = first_tail ? base_lsn(log, first_tail, first_file_off) : 0;
-	best_lsn2 =
-		second_tail ? base_lsn(log, second_tail, second_file_off) : 0;
+	best_lsn2 = second_tail ? base_lsn(log, second_tail, second_file_off) :
+					0;
 
 	if (first_tail && second_tail) {
 		if (best_lsn1 > best_lsn2) {
@@ -1767,8 +1766,8 @@ tail_read:
 
 	page_cnt = page_pos = 1;
 
-	curpage_off = seq_base == log->seq_num ? min(log->next_page, page_off)
-					       : log->next_page;
+	curpage_off = seq_base == log->seq_num ? min(log->next_page, page_off) :
+						       log->next_page;
 
 	wrapped_file =
 		curpage_off == log->first_page &&
@@ -1826,9 +1825,9 @@ use_cur_page:
 			    le64_to_cpu(cur_page->record_hdr.last_end_lsn) &&
 		    ((lsn_cur >> log->file_data_bits) +
 		     ((curpage_off <
-		       (lsn_to_vbo(log, lsn_cur) & ~log->page_mask))
-			      ? 1
-			      : 0)) != expected_seq) {
+		       (lsn_to_vbo(log, lsn_cur) & ~log->page_mask)) ?
+				    1 :
+				    0)) != expected_seq) {
 			goto check_tail;
 		}
 
@@ -2575,7 +2574,7 @@ static int read_next_log_rec(struct ntfs_log *log, struct lcb *lcb, u64 *lsn)
 	return find_log_rec(log, *lsn, lcb);
 }
 
-static inline bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes)
+bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes)
 {
 	__le16 mask;
 	u32 min_de, de_off, used, total;
@@ -2642,9 +2641,10 @@ static inline bool check_index_root(const struct ATTRIB *attr,
 {
 	bool ret;
 	const struct INDEX_ROOT *root = resident_data(attr);
-	u8 index_bits = le32_to_cpu(root->index_block_size) >= sbi->cluster_size
-				? sbi->cluster_bits
-				: SECTOR_SHIFT;
+	u8 index_bits = le32_to_cpu(root->index_block_size) >=
+					sbi->cluster_size ?
+				      sbi->cluster_bits :
+				      SECTOR_SHIFT;
 	u8 block_clst = root->index_block_clst;
 
 	if (le32_to_cpu(attr->res.data_size) < sizeof(struct INDEX_ROOT) ||
@@ -3683,7 +3683,8 @@ move_data:
 
 	if (a_dirty) {
 		attr = oa->attr;
-		err = ntfs_sb_write_run(sbi, oa->run1, vbo, buffer_le, bytes, 0);
+		err = ntfs_sb_write_run(sbi, oa->run1, vbo, buffer_le, bytes,
+					0);
 		if (err)
 			goto out;
 	}
@@ -3768,11 +3769,10 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
 	if (!log)
 		return -ENOMEM;
 
-	memset(&rst_info, 0, sizeof(struct restart_info));
-
 	log->ni = ni;
 	log->l_size = l_size;
 	log->one_page_buf = kmalloc(page_size, GFP_NOFS);
+
 	if (!log->one_page_buf) {
 		err = -ENOMEM;
 		goto out;
@@ -3783,6 +3783,7 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
 	log->page_bits = blksize_bits(page_size);
 
 	/* Look for a restart area on the disk. */
+	memset(&rst_info, 0, sizeof(struct restart_info));
 	err = log_read_rst(log, l_size, true, &rst_info);
 	if (err)
 		goto out;
@@ -3859,10 +3860,10 @@ check_restart_area:
 	log->init_ra = !!rst_info.vbo;
 
 	/* If we have a valid page then grab a pointer to the restart area. */
-	ra2 = rst_info.valid_page
-		      ? Add2Ptr(rst_info.r_page,
-				le16_to_cpu(rst_info.r_page->ra_off))
-		      : NULL;
+	ra2 = rst_info.valid_page ?
+			    Add2Ptr(rst_info.r_page,
+			      le16_to_cpu(rst_info.r_page->ra_off)) :
+			    NULL;
 
 	if (rst_info.chkdsk_was_run ||
 	    (ra2 && ra2->client_idx[1] == LFS_NO_CLIENT_LE)) {
@@ -4256,6 +4257,10 @@ check_attribute_names:
 	rec_len -= t32;
 
 	attr_names = kmemdup(Add2Ptr(lrh, t32), rec_len, GFP_NOFS);
+	if (!attr_names) {
+		err = -ENOMEM;
+		goto out;
+	}
 
 	lcb_put(lcb);
 	lcb = NULL;
diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c
index 567563771bf8..28cc421102e5 100644
--- a/fs/ntfs3/fsntfs.c
+++ b/fs/ntfs3/fsntfs.c
@@ -172,8 +172,8 @@ int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes,
 	u16 sample, fo, fn;
 
 	fo = le16_to_cpu(rhdr->fix_off);
-	fn = simple ? ((bytes >> SECTOR_SHIFT) + 1)
-		    : le16_to_cpu(rhdr->fix_num);
+	fn = simple ? ((bytes >> SECTOR_SHIFT) + 1) :
+			    le16_to_cpu(rhdr->fix_num);
 
 	/* Check errors. */
 	if ((fo & 1) || fo + fn * sizeof(short) > SECTOR_SIZE || !fn-- ||
@@ -223,7 +223,7 @@ int ntfs_extend_init(struct ntfs_sb_info *sbi)
 	inode = ntfs_iget5(sb, &ref, &NAME_EXTEND);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
-		ntfs_err(sb, "Failed to load $Extend.");
+		ntfs_err(sb, "Failed to load $Extend (%d).", err);
 		inode = NULL;
 		goto out;
 	}
@@ -282,7 +282,7 @@ int ntfs_loadlog_and_replay(struct ntfs_inode *ni, struct ntfs_sb_info *sbi)
 
 	/* Check for 4GB. */
 	if (ni->vfs_inode.i_size >= 0x100000000ull) {
-		ntfs_err(sb, "\x24LogFile is too big");
+		ntfs_err(sb, "\x24LogFile is large than 4G.");
 		err = -EINVAL;
 		goto out;
 	}
@@ -646,13 +646,13 @@ next:
 						 NULL, 0, NULL, NULL))
 					goto next;
 
-				__clear_bit_le(ir - MFT_REC_RESERVED,
+				__clear_bit(ir - MFT_REC_RESERVED,
 					    &sbi->mft.reserved_bitmap);
 			}
 		}
 
 		/* Scan 5 bits for zero. Bit 0 == MFT_REC_RESERVED */
-		zbit = find_next_zero_bit_le(&sbi->mft.reserved_bitmap,
+		zbit = find_next_zero_bit(&sbi->mft.reserved_bitmap,
 					  MFT_REC_FREE, MFT_REC_RESERVED);
 		if (zbit >= MFT_REC_FREE) {
 			sbi->mft.next_reserved = MFT_REC_FREE;
@@ -720,7 +720,7 @@ found:
 	if (*rno >= MFT_REC_FREE)
 		wnd_set_used(wnd, *rno, 1);
 	else if (*rno >= MFT_REC_RESERVED && sbi->mft.reserved_bitmap_inited)
-		__set_bit_le(*rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap);
+		__set_bit(*rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap);
 
 out:
 	if (!mft)
@@ -748,7 +748,7 @@ void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno, bool is_mft)
 		else
 			wnd_set_free(wnd, rno, 1);
 	} else if (rno >= MFT_REC_RESERVED && sbi->mft.reserved_bitmap_inited) {
-		__clear_bit_le(rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap);
+		__clear_bit(rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap);
 	}
 
 	if (rno < wnd_zone_bit(wnd))
@@ -846,18 +846,16 @@ void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait)
 {
 	int err;
 	struct super_block *sb = sbi->sb;
-	u32 blocksize;
+	u32 blocksize, bytes;
 	sector_t block1, block2;
-	u32 bytes;
 
-	if (!sb)
+	/*
+	 * sb can be NULL here. In this case sbi->flags should be 0 too.
+	 */
+	if (!sb || !(sbi->flags & NTFS_FLAGS_MFTMIRR))
 		return;
 
 	blocksize = sb->s_blocksize;
-
-	if (!(sbi->flags & NTFS_FLAGS_MFTMIRR))
-		return;
-
 	bytes = sbi->mft.recs_mirr << sbi->record_bits;
 	block1 = sbi->mft.lbo >> sb->s_blocksize_bits;
 	block2 = sbi->mft.lbo2 >> sb->s_blocksize_bits;
@@ -925,6 +923,7 @@ int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty)
 	struct VOLUME_INFO *info;
 	struct mft_inode *mi;
 	struct ntfs_inode *ni;
+	__le16 info_flags;
 
 	/*
 	 * Do not change state if fs was real_dirty.
@@ -957,6 +956,8 @@ int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty)
 		goto out;
 	}
 
+	info_flags = info->flags;
+
 	switch (dirty) {
 	case NTFS_DIRTY_ERROR:
 		ntfs_notice(sbi->sb, "Mark volume as dirty due to NTFS errors");
@@ -970,8 +971,10 @@ int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty)
 		break;
 	}
 	/* Cache current volume flags. */
-	sbi->volume.flags = info->flags;
-	mi->dirty = true;
+	if (info_flags != info->flags) {
+		sbi->volume.flags = info->flags;
+		mi->dirty = true;
+	}
 	err = 0;
 
 out:
@@ -1683,6 +1686,7 @@ struct ntfs_inode *ntfs_new_inode(struct ntfs_sb_info *sbi, CLST rno, bool dir)
 
 out:
 	if (err) {
+		make_bad_inode(inode);
 		iput(inode);
 		ni = ERR_PTR(err);
 	}
@@ -1859,7 +1863,7 @@ int ntfs_security_init(struct ntfs_sb_info *sbi)
 	inode = ntfs_iget5(sb, &ref, &NAME_SECURE);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
-		ntfs_err(sb, "Failed to load $Secure.");
+		ntfs_err(sb, "Failed to load $Secure (%d).", err);
 		inode = NULL;
 		goto out;
 	}
@@ -1870,41 +1874,43 @@ int ntfs_security_init(struct ntfs_sb_info *sbi)
 
 	attr = ni_find_attr(ni, NULL, &le, ATTR_ROOT, SDH_NAME,
 			    ARRAY_SIZE(SDH_NAME), NULL, NULL);
-	if (!attr) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	root_sdh = resident_data_ex(attr, sizeof(struct INDEX_ROOT));
-	if (root_sdh->type != ATTR_ZERO ||
+	if (!attr ||
+	    !(root_sdh = resident_data_ex(attr, sizeof(struct INDEX_ROOT))) ||
+	    root_sdh->type != ATTR_ZERO ||
 	    root_sdh->rule != NTFS_COLLATION_TYPE_SECURITY_HASH ||
-	    offsetof(struct INDEX_ROOT, ihdr) + root_sdh->ihdr.used > attr->res.data_size) {
+	    offsetof(struct INDEX_ROOT, ihdr) +
+			    le32_to_cpu(root_sdh->ihdr.used) >
+		    le32_to_cpu(attr->res.data_size)) {
+		ntfs_err(sb, "$Secure::$SDH is corrupted.");
 		err = -EINVAL;
 		goto out;
 	}
 
 	err = indx_init(indx_sdh, sbi, attr, INDEX_MUTEX_SDH);
-	if (err)
+	if (err) {
+		ntfs_err(sb, "Failed to initialize $Secure::$SDH (%d).", err);
 		goto out;
+	}
 
 	attr = ni_find_attr(ni, attr, &le, ATTR_ROOT, SII_NAME,
 			    ARRAY_SIZE(SII_NAME), NULL, NULL);
-	if (!attr) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	root_sii = resident_data_ex(attr, sizeof(struct INDEX_ROOT));
-	if (root_sii->type != ATTR_ZERO ||
+	if (!attr ||
+	    !(root_sii = resident_data_ex(attr, sizeof(struct INDEX_ROOT))) ||
+	    root_sii->type != ATTR_ZERO ||
 	    root_sii->rule != NTFS_COLLATION_TYPE_UINT ||
-	    offsetof(struct INDEX_ROOT, ihdr) + root_sii->ihdr.used > attr->res.data_size) {
+	    offsetof(struct INDEX_ROOT, ihdr) +
+			    le32_to_cpu(root_sii->ihdr.used) >
+		    le32_to_cpu(attr->res.data_size)) {
+		ntfs_err(sb, "$Secure::$SII is corrupted.");
 		err = -EINVAL;
 		goto out;
 	}
 
 	err = indx_init(indx_sii, sbi, attr, INDEX_MUTEX_SII);
-	if (err)
+	if (err) {
+		ntfs_err(sb, "Failed to initialize $Secure::$SII (%d).", err);
 		goto out;
+	}
 
 	fnd_sii = fnd_get();
 	if (!fnd_sii) {
@@ -2594,8 +2600,10 @@ static inline bool is_reserved_name(struct ntfs_sb_info *sbi,
 	if (len == 4 || (len > 4 && le16_to_cpu(name[4]) == '.')) {
 		port_digit = le16_to_cpu(name[3]);
 		if (port_digit >= '1' && port_digit <= '9')
-			if (!ntfs_cmp_names(name, 3, COM_NAME, 3, upcase, false) ||
-			    !ntfs_cmp_names(name, 3, LPT_NAME, 3, upcase, false))
+			if (!ntfs_cmp_names(name, 3, COM_NAME, 3, upcase,
+					    false) ||
+			    !ntfs_cmp_names(name, 3, LPT_NAME, 3, upcase,
+					    false))
 				return true;
 	}
 
diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c
index 51ab75954640..0a48d2d67219 100644
--- a/fs/ntfs3/index.c
+++ b/fs/ntfs3/index.c
@@ -431,8 +431,9 @@ next_run:
 		if (vbo + blocksize > data_size)
 			nbits = 8 * (data_size - vbo);
 
-		ok = nbits > from ? (*fn)((ulong *)bh->b_data, from, nbits, ret)
-				  : false;
+		ok = nbits > from ?
+				   (*fn)((ulong *)bh->b_data, from, nbits, ret) :
+				   false;
 		put_bh(bh);
 
 		if (ok) {
@@ -725,9 +726,13 @@ static struct NTFS_DE *hdr_find_e(const struct ntfs_index *indx,
 	u32 e_size, e_key_len;
 	u32 end = le32_to_cpu(hdr->used);
 	u32 off = le32_to_cpu(hdr->de_off);
+	u32 total = le32_to_cpu(hdr->total);
 	u16 offs[128];
 
 fill_table:
+	if (end > total)
+		return NULL;
+
 	if (off + sizeof(struct NTFS_DE) > end)
 		return NULL;
 
@@ -760,8 +765,7 @@ binary_search:
 				return NULL;
 
 			max_idx = 0;
-			table_size = min(table_size * 2,
-					 (int)ARRAY_SIZE(offs));
+			table_size = min(table_size * 2, (int)ARRAY_SIZE(offs));
 			goto fill_table;
 		}
 	} else if (diff2 < 0) {
@@ -844,6 +848,10 @@ static inline struct NTFS_DE *hdr_delete_de(struct INDEX_HDR *hdr,
 	u32 off = PtrOffset(hdr, re);
 	int bytes = used - (off + esize);
 
+	/* check INDEX_HDR valid before using INDEX_HDR */
+	if (!check_index_header(hdr, le32_to_cpu(hdr->total)))
+		return NULL;
+
 	if (off >= used || esize < sizeof(struct NTFS_DE) ||
 	    bytes < sizeof(struct NTFS_DE))
 		return NULL;
@@ -986,6 +994,7 @@ struct INDEX_ROOT *indx_get_root(struct ntfs_index *indx, struct ntfs_inode *ni,
 	struct ATTR_LIST_ENTRY *le = NULL;
 	struct ATTRIB *a;
 	const struct INDEX_NAMES *in = &s_index_names[indx->type];
+	struct INDEX_ROOT *root;
 
 	a = ni_find_attr(ni, NULL, &le, ATTR_ROOT, in->name, in->name_len, NULL,
 			 mi);
@@ -995,7 +1004,16 @@ struct INDEX_ROOT *indx_get_root(struct ntfs_index *indx, struct ntfs_inode *ni,
 	if (attr)
 		*attr = a;
 
-	return resident_data_ex(a, sizeof(struct INDEX_ROOT));
+	root = resident_data_ex(a, sizeof(struct INDEX_ROOT));
+
+	/* length check */
+	if (root &&
+	    offsetof(struct INDEX_ROOT, ihdr) + le32_to_cpu(root->ihdr.used) >
+		    le32_to_cpu(a->res.data_size)) {
+		return NULL;
+	}
+
+	return root;
 }
 
 static int indx_write(struct ntfs_index *indx, struct ntfs_inode *ni,
@@ -1085,7 +1103,8 @@ ok:
 	}
 
 	/* check for index header length */
-	if (offsetof(struct INDEX_BUFFER, ihdr) + ib->ihdr.used > bytes) {
+	if (offsetof(struct INDEX_BUFFER, ihdr) + le32_to_cpu(ib->ihdr.used) >
+	    bytes) {
 		err = -EINVAL;
 		goto out;
 	}
@@ -1151,8 +1170,10 @@ int indx_find(struct ntfs_index *indx, struct ntfs_inode *ni,
 
 		/* Read next level. */
 		err = indx_read(indx, ni, de_get_vbn(e), &node);
-		if (err)
+		if (err) {
+			/* io error? */
 			return err;
+		}
 
 		/* Lookup entry that is <= to the search value. */
 		e = hdr_find_e(indx, &node->index->ihdr, key, key_len, ctx,
@@ -1654,9 +1675,9 @@ static int indx_insert_into_root(struct ntfs_index *indx, struct ntfs_inode *ni,
 	mi->dirty = true;
 
 	/* Create alloc and bitmap attributes (if not). */
-	err = run_is_empty(&indx->alloc_run)
-		      ? indx_create_allocate(indx, ni, &new_vbn)
-		      : indx_add_allocate(indx, ni, &new_vbn);
+	err = run_is_empty(&indx->alloc_run) ?
+			    indx_create_allocate(indx, ni, &new_vbn) :
+			    indx_add_allocate(indx, ni, &new_vbn);
 
 	/* Layout of record may be changed, so rescan root. */
 	root = indx_get_root(indx, ni, &attr, &mi);
@@ -1759,10 +1780,11 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni,
 	struct indx_node *n1 = fnd->nodes[level];
 	struct INDEX_HDR *hdr1 = &n1->index->ihdr;
 	struct INDEX_HDR *hdr2;
-	u32 to_copy, used;
+	u32 to_copy, used, used1;
 	CLST new_vbn;
 	__le64 t_vbn, *sub_vbn;
 	u16 sp_size;
+	void *hdr1_saved = NULL;
 
 	/* Try the most easy case. */
 	e = fnd->level - 1 == level ? fnd->de[level] : NULL;
@@ -1795,6 +1817,13 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni,
 		return -ENOMEM;
 	memcpy(up_e, sp, sp_size);
 
+	used1 = le32_to_cpu(hdr1->used);
+	hdr1_saved = kmemdup(hdr1, used1, GFP_NOFS);
+	if (!hdr1_saved) {
+		err = -ENOMEM;
+		goto out;
+	}
+
 	if (!hdr1->flags) {
 		up_e->flags |= NTFS_IE_HAS_SUBNODES;
 		up_e->size = cpu_to_le16(sp_size + sizeof(u64));
@@ -1827,7 +1856,7 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni,
 	hdr_insert_head(hdr2, de_t, to_copy);
 
 	/* Remove all entries (sp including) from hdr1. */
-	used = le32_to_cpu(hdr1->used) - to_copy - sp_size;
+	used = used1 - to_copy - sp_size;
 	memmove(de_t, Add2Ptr(sp, sp_size), used - le32_to_cpu(hdr1->de_off));
 	hdr1->used = cpu_to_le32(used);
 
@@ -1838,9 +1867,9 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni,
 	hdr_insert_de(indx,
 		      (*indx->cmp)(new_de + 1, le16_to_cpu(new_de->key_size),
 				   up_e + 1, le16_to_cpu(up_e->key_size),
-				   ctx) < 0
-			      ? hdr2
-			      : hdr1,
+				   ctx) < 0 ?
+				    hdr2 :
+				    hdr1,
 		      new_de, NULL, ctx);
 
 	indx_mark_used(indx, ni, new_vbn >> indx->idx2vbn_bits);
@@ -1857,8 +1886,6 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni,
 	if (!level) {
 		/* Insert in root. */
 		err = indx_insert_into_root(indx, ni, up_e, NULL, ctx, fnd, 0);
-		if (err)
-			goto out;
 	} else {
 		/*
 		 * The target buffer's parent is another index buffer.
@@ -1866,12 +1893,20 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni,
 		 */
 		err = indx_insert_into_buffer(indx, ni, root, up_e, ctx,
 					      level - 1, fnd);
-		if (err)
-			goto out;
+	}
+
+	if (err) {
+		/*
+		 * Undo critical operations.
+		 */
+		indx_mark_free(indx, ni, new_vbn >> indx->idx2vbn_bits);
+		memcpy(hdr1, hdr1_saved, used1);
+		indx_write(indx, ni, n1, 0);
 	}
 
 out:
 	kfree(up_e);
+	kfree(hdr1_saved);
 
 	return err;
 }
@@ -1930,16 +1965,12 @@ int indx_insert_entry(struct ntfs_index *indx, struct ntfs_inode *ni,
 		 */
 		err = indx_insert_into_root(indx, ni, new_de, fnd->root_de, ctx,
 					    fnd, undo);
-		if (err)
-			goto out;
 	} else {
 		/*
 		 * Found a leaf buffer, so we'll insert the new entry into it.
 		 */
 		err = indx_insert_into_buffer(indx, ni, root, new_de, ctx,
 					      fnd->level - 1, fnd);
-		if (err)
-			goto out;
 	}
 
 out:
@@ -2308,8 +2339,8 @@ int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni,
 			err = level ? indx_insert_into_buffer(indx, ni, root,
 							      re, ctx,
 							      fnd->level - 1,
-							      fnd)
-				    : indx_insert_into_root(indx, ni, re, e,
+							      fnd) :
+					    indx_insert_into_root(indx, ni, re, e,
 							    ctx, fnd, 0);
 			kfree(re);
 
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 309d9b46b5d5..6c560245eef4 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -100,6 +100,12 @@ static struct inode *ntfs_read_mft(struct inode *inode,
 	/* Record should contain $I30 root. */
 	is_dir = rec->flags & RECORD_FLAG_DIR;
 
+	/* MFT_REC_MFT is not a dir */
+	if (is_dir && ino == MFT_REC_MFT) {
+		err = -EINVAL;
+		goto out;
+	}
+
 	inode->i_generation = le16_to_cpu(rec->seq);
 
 	/* Enumerate all struct Attributes MFT. */
@@ -131,7 +137,13 @@ next_attr:
 	rsize = attr->non_res ? 0 : le32_to_cpu(attr->res.data_size);
 	asize = le32_to_cpu(attr->size);
 
-	if (le16_to_cpu(attr->name_off) + attr->name_len > asize)
+	/*
+	 * Really this check was done in 'ni_enum_attr_ex' -> ... 'mi_enum_attr'.
+	 * There not critical to check this case again
+	 */
+	if (attr->name_len &&
+	    sizeof(short) * attr->name_len + le16_to_cpu(attr->name_off) >
+		    asize)
 		goto out;
 
 	if (attr->non_res) {
@@ -250,8 +262,8 @@ next_attr:
 		if (!attr->nres.alloc_size)
 			goto next_attr;
 
-		run = ino == MFT_REC_BITMAP ? &sbi->used.bitmap.run
-					    : &ni->file.run;
+		run = ino == MFT_REC_BITMAP ? &sbi->used.bitmap.run :
+						    &ni->file.run;
 		break;
 
 	case ATTR_ROOT:
@@ -259,7 +271,6 @@ next_attr:
 			goto out;
 
 		root = Add2Ptr(attr, roff);
-		is_root = true;
 
 		if (attr->name_len != ARRAY_SIZE(I30_NAME) ||
 		    memcmp(attr_name(attr), I30_NAME, sizeof(I30_NAME)))
@@ -272,15 +283,16 @@ next_attr:
 		if (!is_dir)
 			goto next_attr;
 
+		is_root = true;
 		ni->ni_flags |= NI_FLAG_DIR;
 
 		err = indx_init(&ni->dir, sbi, attr, INDEX_MUTEX_I30);
 		if (err)
 			goto out;
 
-		mode = sb->s_root
-			       ? (S_IFDIR | (0777 & sbi->options->fs_dmask_inv))
-			       : (S_IFDIR | 0777);
+		mode = sb->s_root ?
+				     (S_IFDIR | (0777 & sbi->options->fs_dmask_inv)) :
+				     (S_IFDIR | 0777);
 		goto next_attr;
 
 	case ATTR_ALLOC:
@@ -437,8 +449,8 @@ end_enum:
 		ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY;
 		inode->i_op = &ntfs_file_inode_operations;
 		inode->i_fop = &ntfs_file_operations;
-		inode->i_mapping->a_ops =
-			is_compressed(ni) ? &ntfs_aops_cmpr : &ntfs_aops;
+		inode->i_mapping->a_ops = is_compressed(ni) ? &ntfs_aops_cmpr :
+								    &ntfs_aops;
 		if (ino != MFT_REC_MFT)
 			init_rwsem(&ni->file.run_lock);
 	} else if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode) ||
@@ -636,6 +648,7 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
 			bh->b_size = block_size;
 			off = vbo & (PAGE_SIZE - 1);
 			set_bh_page(bh, page, off);
+
 			err = bh_read(bh, 0);
 			if (err < 0)
 				goto out;
@@ -773,8 +786,8 @@ static ssize_t ntfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	}
 
 	ret = blockdev_direct_IO(iocb, inode, iter,
-				 wr ? ntfs_get_block_direct_IO_W
-				    : ntfs_get_block_direct_IO_R);
+				 wr ? ntfs_get_block_direct_IO_W :
+					    ntfs_get_block_direct_IO_R);
 
 	if (ret > 0)
 		end = vbo + ret;
@@ -833,7 +846,7 @@ out:
 }
 
 static int ntfs_resident_writepage(struct folio *folio,
-		struct writeback_control *wbc, void *data)
+				   struct writeback_control *wbc, void *data)
 {
 	struct address_space *mapping = data;
 	struct ntfs_inode *ni = ntfs_i(mapping->host);
@@ -874,8 +887,8 @@ int ntfs_write_begin(struct file *file, struct address_space *mapping,
 
 	*pagep = NULL;
 	if (is_resident(ni)) {
-		struct page *page = grab_cache_page_write_begin(
-			mapping, pos >> PAGE_SHIFT);
+		struct page *page =
+			grab_cache_page_write_begin(mapping, pos >> PAGE_SHIFT);
 
 		if (!page) {
 			err = -ENOMEM;
@@ -907,9 +920,8 @@ out:
 /*
  * ntfs_write_end - Address_space_operations::write_end.
  */
-int ntfs_write_end(struct file *file, struct address_space *mapping,
-		   loff_t pos, u32 len, u32 copied, struct page *page,
-		   void *fsdata)
+int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
+		   u32 len, u32 copied, struct page *page, void *fsdata)
 {
 	struct inode *inode = mapping->host;
 	struct ntfs_inode *ni = ntfs_i(inode);
@@ -1307,8 +1319,7 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap,
 	inode_init_owner(idmap, inode, dir, mode);
 	mode = inode->i_mode;
 
-	inode->i_atime = inode->i_mtime = inode->i_ctime = ni->i_crtime =
-		current_time(inode);
+	ni->i_crtime = current_time(inode);
 
 	rec = ni->mi.mrec;
 	rec->hard_links = cpu_to_le16(1);
@@ -1349,10 +1360,9 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap,
 	attr->res.data_size = cpu_to_le32(dsize);
 
 	std5->cr_time = std5->m_time = std5->c_time = std5->a_time =
-		kernel2nt(&inode->i_atime);
+		kernel2nt(&ni->i_crtime);
 
-	ni->std_fa = fa;
-	std5->fa = fa;
+	std5->fa = ni->std_fa = fa;
 
 	attr = Add2Ptr(attr, asize);
 
@@ -1551,11 +1561,15 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap,
 			}
 
 			asize = SIZEOF_NONRESIDENT + ALIGN(err, 8);
+			/* Write non resident data. */
+			err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rp,
+						nsize, 0);
+			if (err)
+				goto out5;
 		} else {
 			attr->res.data_off = SIZEOF_RESIDENT_LE;
 			attr->res.data_size = cpu_to_le32(nsize);
 			memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), rp, nsize);
-			nsize = 0;
 		}
 		/* Size of symlink equals the length of input string. */
 		inode->i_size = size;
@@ -1576,19 +1590,8 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap,
 	rec->used = cpu_to_le32(PtrOffset(rec, attr) + 8);
 	rec->next_attr_id = cpu_to_le16(aid);
 
-	/* Step 2: Add new name in index. */
-	err = indx_insert_entry(&dir_ni->dir, dir_ni, new_de, sbi, fnd, 0);
-	if (err)
-		goto out6;
-
-	/* Unlock parent directory before ntfs_init_acl. */
-	if (!fnd)
-		ni_unlock(dir_ni);
-
 	inode->i_generation = le16_to_cpu(rec->seq);
 
-	dir->i_mtime = dir->i_ctime = inode->i_atime;
-
 	if (S_ISDIR(mode)) {
 		inode->i_op = &ntfs_dir_inode_operations;
 		inode->i_fop = &ntfs_dir_operations;
@@ -1601,8 +1604,8 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap,
 	} else if (S_ISREG(mode)) {
 		inode->i_op = &ntfs_file_inode_operations;
 		inode->i_fop = &ntfs_file_operations;
-		inode->i_mapping->a_ops =
-			is_compressed(ni) ? &ntfs_aops_cmpr : &ntfs_aops;
+		inode->i_mapping->a_ops = is_compressed(ni) ? &ntfs_aops_cmpr :
+								    &ntfs_aops;
 		init_rwsem(&ni->file.run_lock);
 	} else {
 		inode->i_op = &ntfs_special_inode_operations;
@@ -1613,41 +1616,58 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap,
 	if (!S_ISLNK(mode) && (sb->s_flags & SB_POSIXACL)) {
 		err = ntfs_init_acl(idmap, inode, dir);
 		if (err)
-			goto out7;
+			goto out5;
 	} else
 #endif
 	{
 		inode->i_flags |= S_NOSEC;
 	}
 
-	/* Write non resident data. */
-	if (nsize) {
-		err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rp, nsize, 0);
-		if (err)
-			goto out7;
+	/*
+	 * ntfs_init_acl and ntfs_save_wsl_perm update extended attribute.
+	 * The packed size of extended attribute is stored in direntry too.
+	 * 'fname' here points to inside new_de.
+	 */
+	ntfs_save_wsl_perm(inode, &fname->dup.ea_size);
+
+	/*
+	 * update ea_size in file_name attribute too.
+	 * Use ni_find_attr cause layout of MFT record may be changed
+	 * in ntfs_init_acl and ntfs_save_wsl_perm.
+	 */
+	attr = ni_find_attr(ni, NULL, NULL, ATTR_NAME, NULL, 0, NULL, NULL);
+	if (attr) {
+		struct ATTR_FILE_NAME *fn;
+
+		fn = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME);
+		if (fn)
+			fn->dup.ea_size = fname->dup.ea_size;
 	}
 
+	/* We do not need to update parent directory later */
+	ni->ni_flags &= ~NI_FLAG_UPDATE_PARENT;
+
+	/* Step 2: Add new name in index. */
+	err = indx_insert_entry(&dir_ni->dir, dir_ni, new_de, sbi, fnd, 0);
+	if (err)
+		goto out6;
+
 	/*
 	 * Call 'd_instantiate' after inode->i_op is set
 	 * but before finish_open.
 	 */
 	d_instantiate(dentry, inode);
 
-	ntfs_save_wsl_perm(inode);
+	/* Set original time. inode times (i_ctime) may be changed in ntfs_init_acl. */
+	inode->i_atime = inode->i_mtime = inode->i_ctime = dir->i_mtime =
+		dir->i_ctime = ni->i_crtime;
+
 	mark_inode_dirty(dir);
 	mark_inode_dirty(inode);
 
 	/* Normal exit. */
 	goto out2;
 
-out7:
-
-	/* Undo 'indx_insert_entry'. */
-	if (!fnd)
-		ni_lock_dir(dir_ni);
-	indx_delete_entry(&dir_ni->dir, dir_ni, new_de + 1,
-			  le16_to_cpu(new_de->key_size), sbi);
-	/* ni_unlock(dir_ni); will be called later. */
 out6:
 	if (rp_inserted)
 		ntfs_remove_reparse(sbi, IO_REPARSE_TAG_SYMLINK, &new_de->ref);
@@ -1669,11 +1689,11 @@ out2:
 	kfree(rp);
 
 out1:
-	if (err) {
-		if (!fnd)
-			ni_unlock(dir_ni);
+	if (!fnd)
+		ni_unlock(dir_ni);
+
+	if (err)
 		return ERR_PTR(err);
-	}
 
 	unlock_new_inode(inode);
 
@@ -1770,9 +1790,6 @@ void ntfs_evict_inode(struct inode *inode)
 {
 	truncate_inode_pages_final(&inode->i_data);
 
-	if (inode->i_nlink)
-		_ni_write_inode(inode, inode_needs_sync(inode));
-
 	invalidate_inode_buffers(inode);
 	clear_inode(inode);
 
@@ -2057,7 +2074,6 @@ const struct inode_operations ntfs_link_inode_operations = {
 	.get_link	= ntfs_get_link,
 	.setattr	= ntfs3_setattr,
 	.listxattr	= ntfs_listxattr,
-	.permission	= ntfs_permission,
 };
 
 const struct address_space_operations ntfs_aops = {
diff --git a/fs/ntfs3/lznt.c b/fs/ntfs3/lznt.c
index 28f654561f27..61e161c7c567 100644
--- a/fs/ntfs3/lznt.c
+++ b/fs/ntfs3/lznt.c
@@ -296,8 +296,8 @@ next:
  */
 struct lznt *get_lznt_ctx(int level)
 {
-	struct lznt *r = kzalloc(level ? offsetof(struct lznt, hash)
-				       : sizeof(struct lznt),
+	struct lznt *r = kzalloc(level ? offsetof(struct lznt, hash) :
+					       sizeof(struct lznt),
 				 GFP_NOFS);
 
 	if (r)
@@ -392,9 +392,9 @@ ssize_t decompress_lznt(const void *cmpr, size_t cmpr_size, void *unc,
 			unc_use = err;
 		} else {
 			/* This chunk does not contain compressed data. */
-			unc_use = unc_chunk + LZNT_CHUNK_SIZE > unc_end
-					  ? unc_end - unc_chunk
-					  : LZNT_CHUNK_SIZE;
+			unc_use = unc_chunk + LZNT_CHUNK_SIZE > unc_end ?
+						unc_end - unc_chunk :
+						LZNT_CHUNK_SIZE;
 
 			if (cmpr_chunk + sizeof(chunk_hdr) + unc_use >
 			    cmpr_end) {
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index 407fe92394e2..9736b1e4a0f6 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -88,6 +88,16 @@ static struct dentry *ntfs_lookup(struct inode *dir, struct dentry *dentry,
 		__putname(uni);
 	}
 
+	/*
+	 * Check for a null pointer
+	 * If the MFT record of ntfs inode is not a base record, inode->i_op can be NULL.
+	 * This causes null pointer dereference in d_splice_alias().
+	 */
+	if (!IS_ERR_OR_NULL(inode) && !inode->i_op) {
+		iput(inode);
+		inode = ERR_PTR(-EINVAL);
+	}
+
 	return d_splice_alias(inode, dentry);
 }
 
@@ -423,8 +433,8 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry,
 
 	inode = ntfs_create_inode(&nop_mnt_idmap, dir, dentry, uni, mode, 0,
 				  NULL, 0, fnd);
-	err = IS_ERR(inode) ? PTR_ERR(inode)
-			    : finish_open(file, dentry, ntfs_file_open);
+	err = IS_ERR(inode) ? PTR_ERR(inode) :
+				    finish_open(file, dentry, ntfs_file_open);
 	dput(d);
 
 out2:
@@ -597,8 +607,7 @@ const struct inode_operations ntfs_dir_inode_operations = {
 	.rmdir		= ntfs_rmdir,
 	.mknod		= ntfs_mknod,
 	.rename		= ntfs_rename,
-	.permission	= ntfs_permission,
-	.get_inode_acl	= ntfs_get_acl,
+	.get_acl	= ntfs_get_acl,
 	.set_acl	= ntfs_set_acl,
 	.setattr	= ntfs3_setattr,
 	.getattr	= ntfs_getattr,
@@ -611,7 +620,7 @@ const struct inode_operations ntfs_special_inode_operations = {
 	.setattr	= ntfs3_setattr,
 	.getattr	= ntfs_getattr,
 	.listxattr	= ntfs_listxattr,
-	.get_inode_acl	= ntfs_get_acl,
+	.get_acl	= ntfs_get_acl,
 	.set_acl	= ntfs_set_acl,
 };
 
diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h
index 86ea1826d099..90151e56c122 100644
--- a/fs/ntfs3/ntfs.h
+++ b/fs/ntfs3/ntfs.h
@@ -435,9 +435,6 @@ static inline u64 attr_svcn(const struct ATTRIB *attr)
 	return attr->non_res ? le64_to_cpu(attr->nres.svcn) : 0;
 }
 
-/* The size of resident attribute by its resident size. */
-#define BYTES_PER_RESIDENT(b) (0x18 + (b))
-
 static_assert(sizeof(struct ATTRIB) == 0x48);
 static_assert(sizeof(((struct ATTRIB *)NULL)->res) == 0x08);
 static_assert(sizeof(((struct ATTRIB *)NULL)->nres) == 0x38);
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index 80072e5f96f7..eb01f7e76479 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -100,7 +100,6 @@ struct ntfs_mount_options {
 	unsigned hide_dot_files : 1; /* Set hidden flag on dot files. */
 	unsigned windows_names : 1; /* Disallow names forbidden by Windows. */
 	unsigned force : 1; /* RW mount dirty volume. */
-	unsigned noacsrules : 1; /* Exclude acs rules. */
 	unsigned prealloc : 1; /* Preallocate space when file is growing. */
 	unsigned nocase : 1; /* case insensitive. */
 };
@@ -164,7 +163,6 @@ struct wnd_bitmap {
 	size_t zone_bit;
 	size_t zone_end;
 
-	bool set_tail; // Not necessary in driver.
 	bool inited;
 };
 
@@ -340,7 +338,7 @@ enum ntfs_inode_mutex_lock_class {
 };
 
 /*
- * sturct ntfs_inode
+ * struct ntfs_inode
  *
  * Ntfs inode - extends linux inode. consists of one or more MFT inodes.
  */
@@ -581,6 +579,7 @@ int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni,
 bool ni_is_dirty(struct inode *inode);
 
 /* Globals from fslog.c */
+bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes);
 int log_replay(struct ntfs_inode *ni, bool *initialized);
 
 /* Globals from fsntfs.c */
@@ -700,9 +699,8 @@ int ntfs_get_block(struct inode *inode, sector_t vbn,
 		   struct buffer_head *bh_result, int create);
 int ntfs_write_begin(struct file *file, struct address_space *mapping,
 		     loff_t pos, u32 len, struct page **pagep, void **fsdata);
-int ntfs_write_end(struct file *file, struct address_space *mapping,
-		   loff_t pos, u32 len, u32 copied, struct page *page,
-		   void *fsdata);
+int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
+		   u32 len, u32 copied, struct page *page, void *fsdata);
 int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc);
 int ntfs_sync_inode(struct inode *inode);
 int ntfs_flush_inodes(struct super_block *sb, struct inode *i1,
@@ -858,23 +856,22 @@ unsigned long ntfs_names_hash(const u16 *name, size_t len, const u16 *upcase,
 
 /* globals from xattr.c */
 #ifdef CONFIG_NTFS3_FS_POSIX_ACL
-struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu);
+struct posix_acl *ntfs_get_acl(struct mnt_idmap *idmap,
+			       struct dentry *dentry, int type);
 int ntfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
 		 struct posix_acl *acl, int type);
 int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode,
-		  struct inode *dir);
+		 struct inode *dir);
 #else
 #define ntfs_get_acl NULL
 #define ntfs_set_acl NULL
 #endif
 
 int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry);
-int ntfs_permission(struct mnt_idmap *idmap, struct inode *inode,
-		    int mask);
 ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
 extern const struct xattr_handler *ntfs_xattr_handlers[];
 
-int ntfs_save_wsl_perm(struct inode *inode);
+int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size);
 void ntfs_get_wsl_perm(struct inode *inode);
 
 /* globals from lznt.c */
diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c
index defce6a5c8e1..2a281cead2bc 100644
--- a/fs/ntfs3/record.c
+++ b/fs/ntfs3/record.c
@@ -221,7 +221,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
 		}
 
 		if (off + asize < off) {
-			/* overflow check */
+			/* Overflow check. */
 			return NULL;
 		}
 
@@ -247,8 +247,8 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
 	if ((t32 & 0xf) || (t32 > 0x100))
 		return NULL;
 
-	/* Check boundary. */
-	if (off + asize > used)
+	/* Check overflow and boundary. */
+	if (off + asize < off || off + asize > used)
 		return NULL;
 
 	/* Check size of attribute. */
@@ -419,10 +419,9 @@ struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type,
 	struct ntfs_sb_info *sbi = mi->sbi;
 	u32 used = le32_to_cpu(rec->used);
 	const u16 *upcase = sbi->upcase;
-	int diff;
 
 	/* Can we insert mi attribute? */
-	if (used + asize > mi->sbi->record_size)
+	if (used + asize > sbi->record_size)
 		return NULL;
 
 	/*
@@ -431,7 +430,7 @@ struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type,
 	 */
 	attr = NULL;
 	while ((attr = mi_enum_attr(mi, attr))) {
-		diff = compare_attr(attr, type, name, name_len, upcase);
+		int diff = compare_attr(attr, type, name, name_len, upcase);
 
 		if (diff < 0)
 			continue;
@@ -442,9 +441,11 @@ struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type,
 	}
 
 	if (!attr) {
-		tail = 8; /* Not used, just to suppress warning. */
+		/* Append. */
+		tail = 8;
 		attr = Add2Ptr(rec, used - 8);
 	} else {
+		/* Insert before 'attr'. */
 		tail = used - PtrOffset(rec, attr);
 	}
 
diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c
index a5af71cd8d14..47612d16c027 100644
--- a/fs/ntfs3/run.c
+++ b/fs/ntfs3/run.c
@@ -433,9 +433,9 @@ requires_new_range:
 			should_add_tail = Tovcn < r->len;
 
 			if (should_add_tail) {
-				tail_lcn = r->lcn == SPARSE_LCN
-						   ? SPARSE_LCN
-						   : (r->lcn + Tovcn);
+				tail_lcn = r->lcn == SPARSE_LCN ?
+							 SPARSE_LCN :
+							 (r->lcn + Tovcn);
 				tail_vcn = r->vcn + Tovcn;
 				tail_len = r->len - Tovcn;
 			}
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index ef4ea3f21905..5158dd31fd97 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -39,10 +39,10 @@
  * To mount large volumes as ntfs one should use large cluster size (up to 2M)
  * The maximum volume size in this case is 2^32 * 2^21 = 2^53 = 8P
  *
- *     ntfs limits, cluster size is 2M (2^31)
+ *     ntfs limits, cluster size is 2M (2^21)
  * -----------------------------------------------------------------------------
- * | < 8P, 2^54    |  < 2^32  |  yes     |  yes   |   yes    |  yes   |  yes   |
- * | > 8P, 2^54    |  > 2^32  |  no      |  no    |   yes    |  yes   |  yes   |
+ * | < 8P, 2^53    |  < 2^32  |  yes     |  yes   |   yes    |  yes   |  yes   |
+ * | > 8P, 2^53    |  > 2^32  |  no      |  no    |   yes    |  yes   |  yes   |
  * ----------------------------------------------------------|------------------
  *
  */
@@ -115,9 +115,9 @@ void ntfs_inode_printk(struct inode *inode, const char *fmt, ...)
 		return;
 
 	/* Use static allocated buffer, if possible. */
-	name = atomic_dec_and_test(&s_name_buf_cnt)
-		       ? s_name_buf
-		       : kmalloc(sizeof(s_name_buf), GFP_NOFS);
+	name = atomic_dec_and_test(&s_name_buf_cnt) ?
+			     s_name_buf :
+			     kmalloc(sizeof(s_name_buf), GFP_NOFS);
 
 	if (name) {
 		struct dentry *de = d_find_alias(inode);
@@ -253,7 +253,6 @@ enum Opt {
 	Opt_acl,
 	Opt_iocharset,
 	Opt_prealloc,
-	Opt_noacsrules,
 	Opt_nocase,
 	Opt_err,
 };
@@ -271,12 +270,11 @@ static const struct fs_parameter_spec ntfs_fs_parameters[] = {
 	fsparam_flag_no("hidden",		Opt_nohidden),
 	fsparam_flag_no("hide_dot_files",	Opt_hide_dot_files),
 	fsparam_flag_no("windows_names",	Opt_windows_names),
-	fsparam_flag_no("acl",			Opt_acl),
 	fsparam_flag_no("showmeta",		Opt_showmeta),
+	fsparam_flag_no("acl",			Opt_acl),
+	fsparam_string("iocharset",		Opt_iocharset),
 	fsparam_flag_no("prealloc",		Opt_prealloc),
-	fsparam_flag_no("acsrules",		Opt_noacsrules),
 	fsparam_flag_no("nocase",		Opt_nocase),
-	fsparam_string("iocharset",		Opt_iocharset),
 	{}
 };
 
@@ -366,19 +364,20 @@ static int ntfs_fs_parse_param(struct fs_context *fc,
 	case Opt_windows_names:
 		opts->windows_names = result.negated ? 0 : 1;
 		break;
+	case Opt_showmeta:
+		opts->showmeta = result.negated ? 0 : 1;
+		break;
 	case Opt_acl:
 		if (!result.negated)
 #ifdef CONFIG_NTFS3_FS_POSIX_ACL
 			fc->sb_flags |= SB_POSIXACL;
 #else
-			return invalf(fc, "ntfs3: Support for ACL not compiled in!");
+			return invalf(
+				fc, "ntfs3: Support for ACL not compiled in!");
 #endif
 		else
 			fc->sb_flags &= ~SB_POSIXACL;
 		break;
-	case Opt_showmeta:
-		opts->showmeta = result.negated ? 0 : 1;
-		break;
 	case Opt_iocharset:
 		kfree(opts->nls_name);
 		opts->nls_name = param->string;
@@ -387,9 +386,6 @@ static int ntfs_fs_parse_param(struct fs_context *fc,
 	case Opt_prealloc:
 		opts->prealloc = result.negated ? 0 : 1;
 		break;
-	case Opt_noacsrules:
-		opts->noacsrules = result.negated ? 1 : 0;
-		break;
 	case Opt_nocase:
 		opts->nocase = result.negated ? 1 : 0;
 		break;
@@ -409,24 +405,29 @@ static int ntfs_fs_reconfigure(struct fs_context *fc)
 
 	ro_rw = sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY);
 	if (ro_rw && (sbi->flags & NTFS_FLAGS_NEED_REPLAY)) {
-		errorf(fc, "ntfs3: Couldn't remount rw because journal is not replayed. Please umount/remount instead\n");
+		errorf(fc,
+		       "ntfs3: Couldn't remount rw because journal is not replayed. Please umount/remount instead\n");
 		return -EINVAL;
 	}
 
 	new_opts->nls = ntfs_load_nls(new_opts->nls_name);
 	if (IS_ERR(new_opts->nls)) {
 		new_opts->nls = NULL;
-		errorf(fc, "ntfs3: Cannot load iocharset %s", new_opts->nls_name);
+		errorf(fc, "ntfs3: Cannot load iocharset %s",
+		       new_opts->nls_name);
 		return -EINVAL;
 	}
 	if (new_opts->nls != sbi->options->nls)
-		return invalf(fc, "ntfs3: Cannot use different iocharset when remounting!");
+		return invalf(
+			fc,
+			"ntfs3: Cannot use different iocharset when remounting!");
 
 	sync_filesystem(sb);
 
 	if (ro_rw && (sbi->volume.flags & VOLUME_FLAG_DIRTY) &&
 	    !new_opts->force) {
-		errorf(fc, "ntfs3: Volume is dirty and \"force\" flag is not set!");
+		errorf(fc,
+		       "ntfs3: Volume is dirty and \"force\" flag is not set!");
 		return -EINVAL;
 	}
 
@@ -544,40 +545,38 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root)
 	struct ntfs_mount_options *opts = sbi->options;
 	struct user_namespace *user_ns = seq_user_ns(m);
 
-	seq_printf(m, ",uid=%u",
-		  from_kuid_munged(user_ns, opts->fs_uid));
-	seq_printf(m, ",gid=%u",
-		  from_kgid_munged(user_ns, opts->fs_gid));
-	if (opts->fmask)
-		seq_printf(m, ",fmask=%04o", opts->fs_fmask_inv ^ 0xffff);
+	seq_printf(m, ",uid=%u", from_kuid_munged(user_ns, opts->fs_uid));
+	seq_printf(m, ",gid=%u", from_kgid_munged(user_ns, opts->fs_gid));
 	if (opts->dmask)
 		seq_printf(m, ",dmask=%04o", opts->fs_dmask_inv ^ 0xffff);
-	if (opts->nls)
-		seq_printf(m, ",iocharset=%s", opts->nls->charset);
-	else
-		seq_puts(m, ",iocharset=utf8");
+	if (opts->fmask)
+		seq_printf(m, ",fmask=%04o", opts->fs_fmask_inv ^ 0xffff);
 	if (opts->sys_immutable)
 		seq_puts(m, ",sys_immutable");
 	if (opts->discard)
 		seq_puts(m, ",discard");
+	if (opts->force)
+		seq_puts(m, ",force");
 	if (opts->sparse)
 		seq_puts(m, ",sparse");
-	if (opts->showmeta)
-		seq_puts(m, ",showmeta");
 	if (opts->nohidden)
 		seq_puts(m, ",nohidden");
-	if (opts->windows_names)
-		seq_puts(m, ",windows_names");
 	if (opts->hide_dot_files)
 		seq_puts(m, ",hide_dot_files");
-	if (opts->force)
-		seq_puts(m, ",force");
-	if (opts->noacsrules)
-		seq_puts(m, ",noacsrules");
-	if (opts->prealloc)
-		seq_puts(m, ",prealloc");
+	if (opts->windows_names)
+		seq_puts(m, ",windows_names");
+	if (opts->showmeta)
+		seq_puts(m, ",showmeta");
 	if (sb->s_flags & SB_POSIXACL)
 		seq_puts(m, ",acl");
+	if (opts->nls)
+		seq_printf(m, ",iocharset=%s", opts->nls->charset);
+	else
+		seq_puts(m, ",iocharset=utf8");
+	if (opts->prealloc)
+		seq_puts(m, ",prealloc");
+	if (opts->nocase)
+		seq_puts(m, ",nocase");
 
 	return 0;
 }
@@ -706,7 +705,7 @@ static u32 true_sectors_per_clst(const struct NTFS_BOOT *boot)
 	if (boot->sectors_per_clusters <= 0x80)
 		return boot->sectors_per_clusters;
 	if (boot->sectors_per_clusters >= 0xf4) /* limit shift to 2MB max */
-		return 1U << -(s8)boot->sectors_per_clusters;
+		return 1U << (-(s8)boot->sectors_per_clusters);
 	return -EINVAL;
 }
 
@@ -724,6 +723,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
 	struct buffer_head *bh;
 	struct MFT_REC *rec;
 	u16 fn, ao;
+	u8 cluster_bits;
 
 	sbi->volume.blocks = dev_size >> PAGE_SHIFT;
 
@@ -734,48 +734,81 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
 	err = -EINVAL;
 	boot = (struct NTFS_BOOT *)bh->b_data;
 
-	if (memcmp(boot->system_id, "NTFS    ", sizeof("NTFS    ") - 1))
+	if (memcmp(boot->system_id, "NTFS    ", sizeof("NTFS    ") - 1)) {
+		ntfs_err(sb, "Boot's signature is not NTFS.");
 		goto out;
+	}
 
 	/* 0x55AA is not mandaroty. Thanks Maxim Suhanov*/
 	/*if (0x55 != boot->boot_magic[0] || 0xAA != boot->boot_magic[1])
 	 *	goto out;
 	 */
 
-	boot_sector_size = (u32)boot->bytes_per_sector[1] << 8;
-	if (boot->bytes_per_sector[0] || boot_sector_size < SECTOR_SIZE ||
+	boot_sector_size = ((u32)boot->bytes_per_sector[1] << 8) |
+			   boot->bytes_per_sector[0];
+	if (boot_sector_size < SECTOR_SIZE ||
 	    !is_power_of_2(boot_sector_size)) {
+		ntfs_err(sb, "Invalid bytes per sector %u.", boot_sector_size);
 		goto out;
 	}
 
 	/* cluster size: 512, 1K, 2K, 4K, ... 2M */
 	sct_per_clst = true_sectors_per_clst(boot);
-	if ((int)sct_per_clst < 0)
-		goto out;
-	if (!is_power_of_2(sct_per_clst))
+	if ((int)sct_per_clst < 0 || !is_power_of_2(sct_per_clst)) {
+		ntfs_err(sb, "Invalid sectors per cluster %u.", sct_per_clst);
 		goto out;
+	}
+
+	sbi->cluster_size = boot_sector_size * sct_per_clst;
+	sbi->cluster_bits = cluster_bits = blksize_bits(sbi->cluster_size);
+	sbi->cluster_mask = sbi->cluster_size - 1;
+	sbi->cluster_mask_inv = ~(u64)sbi->cluster_mask;
 
 	mlcn = le64_to_cpu(boot->mft_clst);
 	mlcn2 = le64_to_cpu(boot->mft2_clst);
 	sectors = le64_to_cpu(boot->sectors_per_volume);
 
-	if (mlcn * sct_per_clst >= sectors)
+	if (mlcn * sct_per_clst >= sectors || mlcn2 * sct_per_clst >= sectors) {
+		ntfs_err(
+			sb,
+			"Start of MFT 0x%llx (0x%llx) is out of volume 0x%llx.",
+			mlcn, mlcn2, sectors);
 		goto out;
+	}
 
-	if (mlcn2 * sct_per_clst >= sectors)
-		goto out;
+	sbi->record_size = record_size =
+		boot->record_size < 0 ? 1 << (-boot->record_size) :
+					      (u32)boot->record_size << cluster_bits;
+	sbi->record_bits = blksize_bits(record_size);
+	sbi->attr_size_tr = (5 * record_size >> 4); // ~320 bytes
 
 	/* Check MFT record size. */
-	if ((boot->record_size < 0 &&
-	     SECTOR_SIZE > (2U << (-boot->record_size))) ||
-	    (boot->record_size >= 0 && !is_power_of_2(boot->record_size))) {
+	if (record_size < SECTOR_SIZE || !is_power_of_2(record_size)) {
+		ntfs_err(sb, "Invalid bytes per MFT record %u (%d).",
+			 record_size, boot->record_size);
+		goto out;
+	}
+
+	if (record_size > MAXIMUM_BYTES_PER_MFT) {
+		ntfs_err(sb, "Unsupported bytes per MFT record %u.",
+			 record_size);
 		goto out;
 	}
 
+	sbi->index_size = boot->index_size < 0 ?
+					1u << (-boot->index_size) :
+					(u32)boot->index_size << cluster_bits;
+
 	/* Check index record size. */
-	if ((boot->index_size < 0 &&
-	     SECTOR_SIZE > (2U << (-boot->index_size))) ||
-	    (boot->index_size >= 0 && !is_power_of_2(boot->index_size))) {
+	if (sbi->index_size < SECTOR_SIZE || !is_power_of_2(sbi->index_size)) {
+		ntfs_err(sb, "Invalid bytes per index %u(%d).", sbi->index_size,
+			 boot->index_size);
+		goto out;
+	}
+
+	if (sbi->index_size > MAXIMUM_BYTES_PER_INDEX) {
+		ntfs_err(sb, "Unsupported bytes per index %u.",
+			 sbi->index_size);
 		goto out;
 	}
 
@@ -791,53 +824,36 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
 	if (boot_sector_size != sector_size) {
 		ntfs_warn(
 			sb,
-			"Different NTFS' sector size (%u) and media sector size (%u)",
+			"Different NTFS sector size (%u) and media sector size (%u).",
 			boot_sector_size, sector_size);
 		dev_size += sector_size - 1;
 	}
 
-	sbi->cluster_size = boot_sector_size * sct_per_clst;
-	sbi->cluster_bits = blksize_bits(sbi->cluster_size);
-
-	sbi->mft.lbo = mlcn << sbi->cluster_bits;
-	sbi->mft.lbo2 = mlcn2 << sbi->cluster_bits;
+	sbi->mft.lbo = mlcn << cluster_bits;
+	sbi->mft.lbo2 = mlcn2 << cluster_bits;
 
 	/* Compare boot's cluster and sector. */
-	if (sbi->cluster_size < boot_sector_size)
+	if (sbi->cluster_size < boot_sector_size) {
+		ntfs_err(sb, "Invalid bytes per cluster (%u).",
+			 sbi->cluster_size);
 		goto out;
+	}
 
 	/* Compare boot's cluster and media sector. */
 	if (sbi->cluster_size < sector_size) {
 		/* No way to use ntfs_get_block in this case. */
 		ntfs_err(
 			sb,
-			"Failed to mount 'cause NTFS's cluster size (%u) is less than media sector size (%u)",
+			"Failed to mount 'cause NTFS's cluster size (%u) is less than media sector size (%u).",
 			sbi->cluster_size, sector_size);
 		goto out;
 	}
 
-	sbi->cluster_mask = sbi->cluster_size - 1;
-	sbi->cluster_mask_inv = ~(u64)sbi->cluster_mask;
-	sbi->record_size = record_size = boot->record_size < 0
-						 ? 1 << (-boot->record_size)
-						 : (u32)boot->record_size
-							   << sbi->cluster_bits;
-
-	if (record_size > MAXIMUM_BYTES_PER_MFT || record_size < SECTOR_SIZE)
-		goto out;
-
-	sbi->record_bits = blksize_bits(record_size);
-	sbi->attr_size_tr = (5 * record_size >> 4); // ~320 bytes
-
 	sbi->max_bytes_per_attr =
 		record_size - ALIGN(MFTRECORD_FIXUP_OFFSET_1, 8) -
 		ALIGN(((record_size >> SECTOR_SHIFT) * sizeof(short)), 8) -
 		ALIGN(sizeof(enum ATTR_TYPE), 8);
 
-	sbi->index_size = boot->index_size < 0
-				  ? 1u << (-boot->index_size)
-				  : (u32)boot->index_size << sbi->cluster_bits;
-
 	sbi->volume.ser_num = le64_to_cpu(boot->serial_num);
 
 	/* Warning if RAW volume. */
@@ -847,18 +863,18 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
 		gb0 = format_size_gb(dev_size, &mb0);
 		ntfs_warn(
 			sb,
-			"RAW NTFS volume: Filesystem size %u.%02u Gb > volume size %u.%02u Gb. Mount in read-only",
+			"RAW NTFS volume: Filesystem size %u.%02u Gb > volume size %u.%02u Gb. Mount in read-only.",
 			gb, mb, gb0, mb0);
 		sb->s_flags |= SB_RDONLY;
 	}
 
-	clusters = sbi->volume.size >> sbi->cluster_bits;
+	clusters = sbi->volume.size >> cluster_bits;
 #ifndef CONFIG_NTFS3_64BIT_CLUSTER
 	/* 32 bits per cluster. */
 	if (clusters >> 32) {
 		ntfs_notice(
 			sb,
-			"NTFS %u.%02u Gb is too big to use 32 bits per cluster",
+			"NTFS %u.%02u Gb is too big to use 32 bits per cluster.",
 			gb, mb);
 		goto out;
 	}
@@ -892,17 +908,17 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
 	sbi->volume.blocks = sbi->volume.size >> sb->s_blocksize_bits;
 
 	/* Maximum size for normal files. */
-	sbi->maxbytes = (clusters << sbi->cluster_bits) - 1;
+	sbi->maxbytes = (clusters << cluster_bits) - 1;
 
 #ifdef CONFIG_NTFS3_64BIT_CLUSTER
-	if (clusters >= (1ull << (64 - sbi->cluster_bits)))
+	if (clusters >= (1ull << (64 - cluster_bits)))
 		sbi->maxbytes = -1;
 	sbi->maxbytes_sparse = -1;
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 #else
 	/* Maximum size for sparse file. */
-	sbi->maxbytes_sparse = (1ull << (sbi->cluster_bits + 32)) - 1;
-	sb->s_maxbytes = 0xFFFFFFFFull << sbi->cluster_bits;
+	sbi->maxbytes_sparse = (1ull << (cluster_bits + 32)) - 1;
+	sb->s_maxbytes = 0xFFFFFFFFull << cluster_bits;
 #endif
 
 	/*
@@ -910,7 +926,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
 	 * It would be nice if we are able to allocate 1/8 of
 	 * total clusters for MFT but not more then 512 MB.
 	 */
-	sbi->zone_max = min_t(CLST, 0x20000000 >> sbi->cluster_bits, clusters >> 3);
+	sbi->zone_max = min_t(CLST, 0x20000000 >> cluster_bits, clusters >> 3);
 
 	err = 0;
 
@@ -928,6 +944,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	int err;
 	struct ntfs_sb_info *sbi = sb->s_fs_info;
 	struct block_device *bdev = sb->s_bdev;
+	struct ntfs_mount_options *options;
 	struct inode *inode;
 	struct ntfs_inode *ni;
 	size_t i, tt, bad_len, bad_frags;
@@ -942,7 +959,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	ref.high = 0;
 
 	sbi->sb = sb;
-	sbi->options = fc->fs_private;
+	sbi->options = options = fc->fs_private;
 	fc->fs_private = NULL;
 	sb->s_flags |= SB_NODIRATIME;
 	sb->s_magic = 0x7366746e; // "ntfs"
@@ -950,12 +967,12 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	sb->s_export_op = &ntfs_export_ops;
 	sb->s_time_gran = NTFS_TIME_GRAN; // 100 nsec
 	sb->s_xattr = ntfs_xattr_handlers;
-	sb->s_d_op = sbi->options->nocase ? &ntfs_dentry_ops : NULL;
+	sb->s_d_op = options->nocase ? &ntfs_dentry_ops : NULL;
 
-	sbi->options->nls = ntfs_load_nls(sbi->options->nls_name);
-	if (IS_ERR(sbi->options->nls)) {
-		sbi->options->nls = NULL;
-		errorf(fc, "Cannot load nls %s", sbi->options->nls_name);
+	options->nls = ntfs_load_nls(options->nls_name);
+	if (IS_ERR(options->nls)) {
+		options->nls = NULL;
+		errorf(fc, "Cannot load nls %s", options->nls_name);
 		err = -EINVAL;
 		goto out;
 	}
@@ -980,8 +997,8 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	ref.seq = cpu_to_le16(MFT_REC_VOL);
 	inode = ntfs_iget5(sb, &ref, &NAME_VOLUME);
 	if (IS_ERR(inode)) {
-		ntfs_err(sb, "Failed to load $Volume.");
 		err = PTR_ERR(inode);
+		ntfs_err(sb, "Failed to load $Volume (%d).", err);
 		goto out;
 	}
 
@@ -1007,13 +1024,9 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	}
 
 	attr = ni_find_attr(ni, attr, NULL, ATTR_VOL_INFO, NULL, 0, NULL, NULL);
-	if (!attr || is_attr_ext(attr)) {
-		err = -EINVAL;
-		goto put_inode_out;
-	}
-
-	info = resident_data_ex(attr, SIZEOF_ATTRIBUTE_VOLUME_INFO);
-	if (!info) {
+	if (!attr || is_attr_ext(attr) ||
+	    !(info = resident_data_ex(attr, SIZEOF_ATTRIBUTE_VOLUME_INFO))) {
+		ntfs_err(sb, "$Volume is corrupted.");
 		err = -EINVAL;
 		goto put_inode_out;
 	}
@@ -1028,13 +1041,13 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	ref.seq = cpu_to_le16(MFT_REC_MIRR);
 	inode = ntfs_iget5(sb, &ref, &NAME_MIRROR);
 	if (IS_ERR(inode)) {
-		ntfs_err(sb, "Failed to load $MFTMirr.");
 		err = PTR_ERR(inode);
+		ntfs_err(sb, "Failed to load $MFTMirr (%d).", err);
 		goto out;
 	}
 
-	sbi->mft.recs_mirr =
-		ntfs_up_cluster(sbi, inode->i_size) >> sbi->record_bits;
+	sbi->mft.recs_mirr = ntfs_up_cluster(sbi, inode->i_size) >>
+			     sbi->record_bits;
 
 	iput(inode);
 
@@ -1043,8 +1056,8 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	ref.seq = cpu_to_le16(MFT_REC_LOG);
 	inode = ntfs_iget5(sb, &ref, &NAME_LOGFILE);
 	if (IS_ERR(inode)) {
-		ntfs_err(sb, "Failed to load \x24LogFile.");
 		err = PTR_ERR(inode);
+		ntfs_err(sb, "Failed to load \x24LogFile (%d).", err);
 		goto out;
 	}
 
@@ -1064,7 +1077,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
 			goto out;
 		}
 	} else if (sbi->volume.flags & VOLUME_FLAG_DIRTY) {
-		if (!sb_rdonly(sb) && !sbi->options->force) {
+		if (!sb_rdonly(sb) && !options->force) {
 			ntfs_warn(
 				sb,
 				"volume is dirty and \"force\" flag is not set!");
@@ -1079,8 +1092,8 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
 
 	inode = ntfs_iget5(sb, &ref, &NAME_MFT);
 	if (IS_ERR(inode)) {
-		ntfs_err(sb, "Failed to load $MFT.");
 		err = PTR_ERR(inode);
+		ntfs_err(sb, "Failed to load $MFT (%d).", err);
 		goto out;
 	}
 
@@ -1095,8 +1108,10 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
 		goto put_inode_out;
 
 	err = ni_load_all_mi(ni);
-	if (err)
+	if (err) {
+		ntfs_err(sb, "Failed to load $MFT's subrecords (%d).", err);
 		goto put_inode_out;
+	}
 
 	sbi->mft.ni = ni;
 
@@ -1105,8 +1120,8 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	ref.seq = cpu_to_le16(MFT_REC_BITMAP);
 	inode = ntfs_iget5(sb, &ref, &NAME_BITMAP);
 	if (IS_ERR(inode)) {
-		ntfs_err(sb, "Failed to load $Bitmap.");
 		err = PTR_ERR(inode);
+		ntfs_err(sb, "Failed to load $Bitmap (%d).", err);
 		goto out;
 	}
 
@@ -1120,22 +1135,25 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	/* Check bitmap boundary. */
 	tt = sbi->used.bitmap.nbits;
 	if (inode->i_size < bitmap_size(tt)) {
+		ntfs_err(sb, "$Bitmap is corrupted.");
 		err = -EINVAL;
 		goto put_inode_out;
 	}
 
-	/* Not necessary. */
-	sbi->used.bitmap.set_tail = true;
 	err = wnd_init(&sbi->used.bitmap, sb, tt);
-	if (err)
+	if (err) {
+		ntfs_err(sb, "Failed to initialize $Bitmap (%d).", err);
 		goto put_inode_out;
+	}
 
 	iput(inode);
 
 	/* Compute the MFT zone. */
 	err = ntfs_refresh_zone(sbi);
-	if (err)
+	if (err) {
+		ntfs_err(sb, "Failed to initialize MFT zone (%d).", err);
 		goto out;
+	}
 
 	/* Load $BadClus. */
 	ref.low = cpu_to_le32(MFT_REC_BADCLUST);
@@ -1180,15 +1198,23 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	ref.seq = cpu_to_le16(MFT_REC_ATTR);
 	inode = ntfs_iget5(sb, &ref, &NAME_ATTRDEF);
 	if (IS_ERR(inode)) {
-		ntfs_err(sb, "Failed to load $AttrDef -> %d", err);
 		err = PTR_ERR(inode);
+		ntfs_err(sb, "Failed to load $AttrDef (%d)", err);
 		goto out;
 	}
 
-	if (inode->i_size < sizeof(struct ATTR_DEF_ENTRY)) {
+	/*
+	 * Typical $AttrDef contains up to 20 entries.
+	 * Check for extremely large/small size.
+	 */
+	if (inode->i_size < sizeof(struct ATTR_DEF_ENTRY) ||
+	    inode->i_size > 100 * sizeof(struct ATTR_DEF_ENTRY)) {
+		ntfs_err(sb, "Looks like $AttrDef is corrupted (size=%llu).",
+			 inode->i_size);
 		err = -EINVAL;
 		goto put_inode_out;
 	}
+
 	bytes = inode->i_size;
 	sbi->def_table = t = kmalloc(bytes, GFP_NOFS | __GFP_NOWARN);
 	if (!t) {
@@ -1202,6 +1228,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
 
 		if (IS_ERR(page)) {
 			err = PTR_ERR(page);
+			ntfs_err(sb, "Failed to read $AttrDef (%d).", err);
 			goto put_inode_out;
 		}
 		memcpy(Add2Ptr(t, done), page_address(page),
@@ -1209,6 +1236,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
 		ntfs_unmap_page(page);
 
 		if (!idx && ATTR_STD != t->type) {
+			ntfs_err(sb, "$AttrDef is corrupted.");
 			err = -EINVAL;
 			goto put_inode_out;
 		}
@@ -1243,13 +1271,14 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	ref.seq = cpu_to_le16(MFT_REC_UPCASE);
 	inode = ntfs_iget5(sb, &ref, &NAME_UPCASE);
 	if (IS_ERR(inode)) {
-		ntfs_err(sb, "Failed to load $UpCase.");
 		err = PTR_ERR(inode);
+		ntfs_err(sb, "Failed to load $UpCase (%d).", err);
 		goto out;
 	}
 
 	if (inode->i_size != 0x10000 * sizeof(short)) {
 		err = -EINVAL;
+		ntfs_err(sb, "$UpCase is corrupted.");
 		goto put_inode_out;
 	}
 
@@ -1260,6 +1289,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
 
 		if (IS_ERR(page)) {
 			err = PTR_ERR(page);
+			ntfs_err(sb, "Failed to read $UpCase (%d).", err);
 			goto put_inode_out;
 		}
 
@@ -1285,23 +1315,31 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	if (is_ntfs3(sbi)) {
 		/* Load $Secure. */
 		err = ntfs_security_init(sbi);
-		if (err)
+		if (err) {
+			ntfs_err(sb, "Failed to initialize $Secure (%d).", err);
 			goto out;
+		}
 
 		/* Load $Extend. */
 		err = ntfs_extend_init(sbi);
-		if (err)
+		if (err) {
+			ntfs_warn(sb, "Failed to initialize $Extend.");
 			goto load_root;
+		}
 
-		/* Load $Extend\$Reparse. */
+		/* Load $Extend/$Reparse. */
 		err = ntfs_reparse_init(sbi);
-		if (err)
+		if (err) {
+			ntfs_warn(sb, "Failed to initialize $Extend/$Reparse.");
 			goto load_root;
+		}
 
-		/* Load $Extend\$ObjId. */
+		/* Load $Extend/$ObjId. */
 		err = ntfs_objid_init(sbi);
-		if (err)
+		if (err) {
+			ntfs_warn(sb, "Failed to initialize $Extend/$ObjId.");
 			goto load_root;
+		}
 	}
 
 load_root:
@@ -1309,12 +1347,21 @@ load_root:
 	ref.low = cpu_to_le32(MFT_REC_ROOT);
 	ref.seq = cpu_to_le16(MFT_REC_ROOT);
 	inode = ntfs_iget5(sb, &ref, &NAME_ROOT);
-	if (IS_ERR(inode) || !inode->i_op) {
-		ntfs_err(sb, "Failed to load root.");
-		err = IS_ERR(inode) ? PTR_ERR(inode) : -EINVAL;
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		ntfs_err(sb, "Failed to load root (%d).", err);
 		goto out;
 	}
 
+	/*
+	 * Final check. Looks like this case should never occurs.
+	 */
+	if (!inode->i_op) {
+		err = -EINVAL;
+		ntfs_err(sb, "Failed to load root (%d).", err);
+		goto put_inode_out;
+	}
+
 	sb->s_root = d_make_root(inode);
 	if (!sb->s_root) {
 		err = -ENOMEM;
@@ -1434,7 +1481,7 @@ static const struct fs_context_operations ntfs_context_ops = {
 };
 
 /*
- * ntfs_init_fs_context - Initialize spi and opts
+ * ntfs_init_fs_context - Initialize sbi and opts
  *
  * This will called when mount/remount. We will first initialize
  * options so that if remount we can use just that.
@@ -1507,7 +1554,8 @@ static int __init init_ntfs_fs(void)
 	if (IS_ENABLED(CONFIG_NTFS3_FS_POSIX_ACL))
 		pr_info("ntfs3: Enabled Linux POSIX ACLs support\n");
 	if (IS_ENABLED(CONFIG_NTFS3_64BIT_CLUSTER))
-		pr_notice("ntfs3: Warning: Activated 64 bits per cluster. Windows does not support this\n");
+		pr_notice(
+			"ntfs3: Warning: Activated 64 bits per cluster. Windows does not support this\n");
 	if (IS_ENABLED(CONFIG_NTFS3_LZX_XPRESS))
 		pr_info("ntfs3: Read-only LZX/Xpress compression included\n");
 
@@ -1550,7 +1598,9 @@ MODULE_DESCRIPTION("ntfs3 read/write filesystem");
 MODULE_INFO(behaviour, "Enabled Linux POSIX ACLs support");
 #endif
 #ifdef CONFIG_NTFS3_64BIT_CLUSTER
-MODULE_INFO(cluster, "Warning: Activated 64 bits per cluster. Windows does not support this");
+MODULE_INFO(
+	cluster,
+	"Warning: Activated 64 bits per cluster. Windows does not support this");
 #endif
 #ifdef CONFIG_NTFS3_LZX_XPRESS
 MODULE_INFO(compression, "Read-only lzx/xpress compression included");
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index 7324cf924932..c3de60a4543f 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -23,8 +23,8 @@
 
 static inline size_t unpacked_ea_size(const struct EA_FULL *ea)
 {
-	return ea->size ? le32_to_cpu(ea->size)
-			: ALIGN(struct_size(ea, name,
+	return ea->size ? le32_to_cpu(ea->size) :
+				ALIGN(struct_size(ea, name,
 					    1 + ea->name_len +
 						    le16_to_cpu(ea->elength)),
 				4);
@@ -296,7 +296,8 @@ out:
 
 static noinline int ntfs_set_ea(struct inode *inode, const char *name,
 				size_t name_len, const void *value,
-				size_t val_size, int flags, bool locked)
+				size_t val_size, int flags, bool locked,
+				__le16 *ea_size)
 {
 	struct ntfs_inode *ni = ntfs_i(inode);
 	struct ntfs_sb_info *sbi = ni->mi.sbi;
@@ -410,7 +411,7 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name,
 
 	/*
 	 * 1. Check ea_info.size_pack for overflow.
-	 * 2. New attibute size must fit value from $AttrDef
+	 * 2. New attribute size must fit value from $AttrDef
 	 */
 	if (new_pack > 0xffff || size > sbi->ea_max_size) {
 		ntfs_inode_warn(
@@ -504,6 +505,8 @@ update_ea:
 
 	if (ea_info.size_pack != size_pack)
 		ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
+	if (ea_size)
+		*ea_size = ea_info.size_pack;
 	mark_inode_dirty(&ni->vfs_inode);
 
 out:
@@ -517,9 +520,14 @@ out:
 }
 
 #ifdef CONFIG_NTFS3_FS_POSIX_ACL
-static struct posix_acl *ntfs_get_acl_ex(struct inode *inode, int type,
-					 int locked)
+
+/*
+ * ntfs_get_acl - inode_operations::get_acl
+ */
+struct posix_acl *ntfs_get_acl(struct mnt_idmap *idmap,
+			       struct dentry *dentry, int type)
 {
+	struct inode *inode = d_inode(dentry);
 	struct ntfs_inode *ni = ntfs_i(inode);
 	const char *name;
 	size_t name_len;
@@ -542,13 +550,11 @@ static struct posix_acl *ntfs_get_acl_ex(struct inode *inode, int type,
 		name_len = sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1;
 	}
 
-	if (!locked)
-		ni_lock(ni);
+	ni_lock(ni);
 
 	err = ntfs_get_ea(inode, name, name_len, buf, PATH_MAX, &req);
 
-	if (!locked)
-		ni_unlock(ni);
+	ni_unlock(ni);
 
 	/* Translate extended attribute to acl. */
 	if (err >= 0) {
@@ -567,17 +573,6 @@ static struct posix_acl *ntfs_get_acl_ex(struct inode *inode, int type,
 	return acl;
 }
 
-/*
- * ntfs_get_acl - inode_operations::get_acl
- */
-struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu)
-{
-	if (rcu)
-		return ERR_PTR(-ECHILD);
-
-	return ntfs_get_acl_ex(inode, type, 0);
-}
-
 static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap,
 				    struct inode *inode, struct posix_acl *acl,
 				    int type, bool init_acl)
@@ -633,7 +628,7 @@ static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap,
 		flags = 0;
 	}
 
-	err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0);
+	err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0, NULL);
 	if (err == -ENODATA && !size)
 		err = 0; /* Removing non existed xattr. */
 	if (!err) {
@@ -712,20 +707,6 @@ int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry)
 }
 
 /*
- * ntfs_permission - inode_operations::permission
- */
-int ntfs_permission(struct mnt_idmap *idmap, struct inode *inode,
-		    int mask)
-{
-	if (ntfs_sb(inode->i_sb)->options->noacsrules) {
-		/* "No access rules" mode - Allow all changes. */
-		return 0;
-	}
-
-	return generic_permission(idmap, inode, mask);
-}
-
-/*
  * ntfs_listxattr - inode_operations::listxattr
  */
 ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
@@ -780,7 +761,7 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de,
 			err = sizeof(u32);
 			*(u32 *)buffer = le32_to_cpu(ni->std_fa);
 			if (!strcmp(name, SYSTEM_NTFS_ATTRIB_BE))
-				*(u32 *)buffer = cpu_to_be32(*(u32 *)buffer);
+				*(__be32 *)buffer = cpu_to_be32(*(u32 *)buffer);
 		}
 		goto out;
 	}
@@ -857,7 +838,7 @@ static noinline int ntfs_setxattr(const struct xattr_handler *handler,
 		if (size != sizeof(u32))
 			goto out;
 		if (!strcmp(name, SYSTEM_NTFS_ATTRIB_BE))
-			new_fa = cpu_to_le32(be32_to_cpu(*(u32 *)value));
+			new_fa = cpu_to_le32(be32_to_cpu(*(__be32 *)value));
 		else
 			new_fa = cpu_to_le32(*(u32 *)value);
 
@@ -937,7 +918,8 @@ set_new_fa:
 	}
 
 	/* Deal with NTFS extended attribute. */
-	err = ntfs_set_ea(inode, name, strlen(name), value, size, flags, 0);
+	err = ntfs_set_ea(inode, name, strlen(name), value, size, flags, 0,
+			  NULL);
 
 out:
 	inode->i_ctime = current_time(inode);
@@ -951,7 +933,7 @@ out:
  *
  * save uid/gid/mode in xattr
  */
-int ntfs_save_wsl_perm(struct inode *inode)
+int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size)
 {
 	int err;
 	__le32 value;
@@ -960,26 +942,26 @@ int ntfs_save_wsl_perm(struct inode *inode)
 	ni_lock(ni);
 	value = cpu_to_le32(i_uid_read(inode));
 	err = ntfs_set_ea(inode, "$LXUID", sizeof("$LXUID") - 1, &value,
-			  sizeof(value), 0, true); /* true == already locked. */
+			  sizeof(value), 0, true, ea_size);
 	if (err)
 		goto out;
 
 	value = cpu_to_le32(i_gid_read(inode));
 	err = ntfs_set_ea(inode, "$LXGID", sizeof("$LXGID") - 1, &value,
-			  sizeof(value), 0, true);
+			  sizeof(value), 0, true, ea_size);
 	if (err)
 		goto out;
 
 	value = cpu_to_le32(inode->i_mode);
 	err = ntfs_set_ea(inode, "$LXMOD", sizeof("$LXMOD") - 1, &value,
-			  sizeof(value), 0, true);
+			  sizeof(value), 0, true, ea_size);
 	if (err)
 		goto out;
 
 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
 		value = cpu_to_le32(inode->i_rdev);
 		err = ntfs_set_ea(inode, "$LXDEV", sizeof("$LXDEV") - 1, &value,
-				  sizeof(value), 0, true);
+				  sizeof(value), 0, true, ea_size);
 		if (err)
 			goto out;
 	}
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 811a6ea374bb..b1550ba73f96 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -803,8 +803,8 @@ bail:
  * a better backward&forward compatibility, since a small piece of
  * request will be less likely to be broken if disk layout get changed.
  */
-static int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
-			     int compat_flag)
+static noinline_for_stack int
+ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, int compat_flag)
 {
 	int i, status = 0;
 	u64 req_addr;
@@ -840,27 +840,26 @@ bail:
 long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
-	int new_clusters;
-	int status;
-	struct ocfs2_space_resv sr;
-	struct ocfs2_new_group_input input;
-	struct reflink_arguments args;
-	const char __user *old_path;
-	const char __user *new_path;
-	bool preserve;
-	struct ocfs2_info info;
 	void __user *argp = (void __user *)arg;
+	int status;
 
 	switch (cmd) {
 	case OCFS2_IOC_RESVSP:
 	case OCFS2_IOC_RESVSP64:
 	case OCFS2_IOC_UNRESVSP:
 	case OCFS2_IOC_UNRESVSP64:
+	{
+		struct ocfs2_space_resv sr;
+
 		if (copy_from_user(&sr, (int __user *) arg, sizeof(sr)))
 			return -EFAULT;
 
 		return ocfs2_change_file_space(filp, cmd, &sr);
+	}
 	case OCFS2_IOC_GROUP_EXTEND:
+	{
+		int new_clusters;
+
 		if (!capable(CAP_SYS_RESOURCE))
 			return -EPERM;
 
@@ -873,8 +872,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		status = ocfs2_group_extend(inode, new_clusters);
 		mnt_drop_write_file(filp);
 		return status;
+	}
 	case OCFS2_IOC_GROUP_ADD:
 	case OCFS2_IOC_GROUP_ADD64:
+	{
+		struct ocfs2_new_group_input input;
+
 		if (!capable(CAP_SYS_RESOURCE))
 			return -EPERM;
 
@@ -887,7 +890,14 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		status = ocfs2_group_add(inode, &input);
 		mnt_drop_write_file(filp);
 		return status;
+	}
 	case OCFS2_IOC_REFLINK:
+	{
+		struct reflink_arguments args;
+		const char __user *old_path;
+		const char __user *new_path;
+		bool preserve;
+
 		if (copy_from_user(&args, argp, sizeof(args)))
 			return -EFAULT;
 		old_path = (const char __user *)(unsigned long)args.old_path;
@@ -895,11 +905,16 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		preserve = (args.preserve != 0);
 
 		return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
+	}
 	case OCFS2_IOC_INFO:
+	{
+		struct ocfs2_info info;
+
 		if (copy_from_user(&info, argp, sizeof(struct ocfs2_info)))
 			return -EFAULT;
 
 		return ocfs2_info_handle(inode, &info, 0);
+	}
 	case FITRIM:
 	{
 		struct super_block *sb = inode->i_sb;
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index aefdf1d3be7c..9014bbcc8031 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -244,7 +244,7 @@ static void orangefs_readahead(struct readahead_control *rac)
 	struct iov_iter iter;
 	struct inode *inode = rac->mapping->host;
 	struct xarray *i_pages;
-	struct page *page;
+	struct folio *folio;
 	loff_t new_start = readahead_pos(rac);
 	int ret;
 	size_t new_len = 0;
@@ -275,9 +275,10 @@ static void orangefs_readahead(struct readahead_control *rac)
 		ret = 0;
 
 	/* clean up. */
-	while ((page = readahead_page(rac))) {
-		page_endio(page, false, ret);
-		put_page(page);
+	while ((folio = readahead_folio(rac))) {
+		if (!ret)
+			folio_mark_uptodate(folio);
+		folio_unlock(folio);
 	}
 }
 
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 9b0315d34c58..d35bbf35a874 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -91,6 +91,7 @@
 #include <linux/user_namespace.h>
 #include <linux/fs_struct.h>
 #include <linux/kthread.h>
+#include <linux/mmu_context.h>
 
 #include <asm/processor.h>
 #include "internal.h"
@@ -219,6 +220,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 		seq_put_decimal_ull(m, "\t", task_session_nr_ns(p, pid->numbers[g].ns));
 #endif
 	seq_putc(m, '\n');
+
+	seq_printf(m, "Kthread:\t%c\n", p->flags & PF_KTHREAD ? '1' : '0');
 }
 
 void render_sigset_t(struct seq_file *m, const char *header,
@@ -423,6 +426,11 @@ static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm)
 	seq_printf(m, "THP_enabled:\t%d\n", thp_enabled);
 }
 
+static inline void task_untag_mask(struct seq_file *m, struct mm_struct *mm)
+{
+	seq_printf(m, "untag_mask:\t%#lx\n", mm_untag_mask(mm));
+}
+
 int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 			struct pid *pid, struct task_struct *task)
 {
@@ -438,6 +446,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 		task_mem(m, mm);
 		task_core_dumping(m, task);
 		task_thp_status(m, mm);
+		task_untag_mask(m, mm);
 		mmput(mm);
 	}
 	task_sig(m, task);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5e0e0ccd47aa..05452c3b9872 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -96,6 +96,7 @@
 #include <linux/time_namespace.h>
 #include <linux/resctrl.h>
 #include <linux/cn_proc.h>
+#include <linux/ksm.h>
 #include <trace/events/oom.h>
 #include "internal.h"
 #include "fd.h"
@@ -699,7 +700,6 @@ int proc_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 		return error;
 
 	setattr_copy(&nop_mnt_idmap, inode, attr);
-	mark_inode_dirty(inode);
 	return 0;
 }
 
@@ -3207,6 +3207,8 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
 	mm = get_task_mm(task);
 	if (mm) {
 		seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items);
+		seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages);
+		seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm));
 		mmput(mm);
 	}
 
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 8379593fa4bb..42ae38ff6e7e 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -127,7 +127,6 @@ static int proc_notify_change(struct mnt_idmap *idmap,
 		return error;
 
 	setattr_copy(&nop_mnt_idmap, inode, iattr);
-	mark_inode_dirty(inode);
 
 	proc_set_user(de, inode->i_uid, inode->i_gid);
 	de->mode = inode->i_mode;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 71157ee35c1a..25b44b303b35 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -24,7 +24,7 @@
 #include <linux/memblock.h>
 #include <linux/init.h>
 #include <linux/slab.h>
-#include <linux/uaccess.h>
+#include <linux/uio.h>
 #include <asm/io.h>
 #include <linux/list.h>
 #include <linux/ioport.h>
@@ -307,10 +307,9 @@ static void append_kcore_note(char *notes, size_t *i, const char *name,
 	*i = ALIGN(*i + descsz, 4);
 }
 
-static ssize_t
-read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
+static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
 {
-	char *buf = file->private_data;
+	loff_t *fpos = &iocb->ki_pos;
 	size_t phdrs_offset, notes_offset, data_offset;
 	size_t page_offline_frozen = 1;
 	size_t phdrs_len, notes_len;
@@ -318,6 +317,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 	size_t tsz;
 	int nphdr;
 	unsigned long start;
+	size_t buflen = iov_iter_count(iter);
 	size_t orig_buflen = buflen;
 	int ret = 0;
 
@@ -356,12 +356,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 		};
 
 		tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos);
-		if (copy_to_user(buffer, (char *)&ehdr + *fpos, tsz)) {
+		if (copy_to_iter((char *)&ehdr + *fpos, tsz, iter) != tsz) {
 			ret = -EFAULT;
 			goto out;
 		}
 
-		buffer += tsz;
 		buflen -= tsz;
 		*fpos += tsz;
 	}
@@ -398,15 +397,14 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 		}
 
 		tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos);
-		if (copy_to_user(buffer, (char *)phdrs + *fpos - phdrs_offset,
-				 tsz)) {
+		if (copy_to_iter((char *)phdrs + *fpos - phdrs_offset, tsz,
+				 iter) != tsz) {
 			kfree(phdrs);
 			ret = -EFAULT;
 			goto out;
 		}
 		kfree(phdrs);
 
-		buffer += tsz;
 		buflen -= tsz;
 		*fpos += tsz;
 	}
@@ -448,14 +446,13 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 				  min(vmcoreinfo_size, notes_len - i));
 
 		tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos);
-		if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) {
+		if (copy_to_iter(notes + *fpos - notes_offset, tsz, iter) != tsz) {
 			kfree(notes);
 			ret = -EFAULT;
 			goto out;
 		}
 		kfree(notes);
 
-		buffer += tsz;
 		buflen -= tsz;
 		*fpos += tsz;
 	}
@@ -497,7 +494,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 		}
 
 		if (!m) {
-			if (clear_user(buffer, tsz)) {
+			if (iov_iter_zero(tsz, iter) != tsz) {
 				ret = -EFAULT;
 				goto out;
 			}
@@ -506,16 +503,33 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 
 		switch (m->type) {
 		case KCORE_VMALLOC:
-			vread(buf, (char *)start, tsz);
-			/* we have to zero-fill user buffer even if no read */
-			if (copy_to_user(buffer, buf, tsz)) {
-				ret = -EFAULT;
-				goto out;
+		{
+			const char *src = (char *)start;
+			size_t read = 0, left = tsz;
+
+			/*
+			 * vmalloc uses spinlocks, so we optimistically try to
+			 * read memory. If this fails, fault pages in and try
+			 * again until we are done.
+			 */
+			while (true) {
+				read += vread_iter(iter, src, left);
+				if (read == tsz)
+					break;
+
+				src += read;
+				left -= read;
+
+				if (fault_in_iov_iter_writeable(iter, left)) {
+					ret = -EFAULT;
+					goto out;
+				}
 			}
 			break;
+		}
 		case KCORE_USER:
 			/* User page is handled prior to normal kernel page: */
-			if (copy_to_user(buffer, (char *)start, tsz)) {
+			if (copy_to_iter((char *)start, tsz, iter) != tsz) {
 				ret = -EFAULT;
 				goto out;
 			}
@@ -531,7 +545,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 			 */
 			if (!page || PageOffline(page) ||
 			    is_page_hwpoison(page) || !pfn_is_ram(pfn)) {
-				if (clear_user(buffer, tsz)) {
+				if (iov_iter_zero(tsz, iter) != tsz) {
 					ret = -EFAULT;
 					goto out;
 				}
@@ -541,24 +555,17 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 		case KCORE_VMEMMAP:
 		case KCORE_TEXT:
 			/*
-			 * Using bounce buffer to bypass the
-			 * hardened user copy kernel text checks.
+			 * We use _copy_to_iter() to bypass usermode hardening
+			 * which would otherwise prevent this operation.
 			 */
-			if (copy_from_kernel_nofault(buf, (void *)start, tsz)) {
-				if (clear_user(buffer, tsz)) {
-					ret = -EFAULT;
-					goto out;
-				}
-			} else {
-				if (copy_to_user(buffer, buf, tsz)) {
-					ret = -EFAULT;
-					goto out;
-				}
+			if (_copy_to_iter((char *)start, tsz, iter) != tsz) {
+				ret = -EFAULT;
+				goto out;
 			}
 			break;
 		default:
 			pr_warn_once("Unhandled KCORE type: %d\n", m->type);
-			if (clear_user(buffer, tsz)) {
+			if (iov_iter_zero(tsz, iter) != tsz) {
 				ret = -EFAULT;
 				goto out;
 			}
@@ -566,7 +573,6 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 skip:
 		buflen -= tsz;
 		*fpos += tsz;
-		buffer += tsz;
 		start += tsz;
 		tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen);
 	}
@@ -589,10 +595,6 @@ static int open_kcore(struct inode *inode, struct file *filp)
 	if (ret)
 		return ret;
 
-	filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!filp->private_data)
-		return -ENOMEM;
-
 	if (kcore_need_update)
 		kcore_update_ram();
 	if (i_size_read(inode) != proc_root_kcore->size) {
@@ -603,16 +605,9 @@ static int open_kcore(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static int release_kcore(struct inode *inode, struct file *file)
-{
-	kfree(file->private_data);
-	return 0;
-}
-
 static const struct proc_ops kcore_proc_ops = {
-	.proc_read	= read_kcore,
+	.proc_read_iter	= read_kcore_iter,
 	.proc_open	= open_kcore,
-	.proc_release	= release_kcore,
 	.proc_lseek	= default_llseek,
 };
 
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 440960110a42..b43d0bd42762 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -6,6 +6,7 @@
 #include <linux/hugetlb.h>
 #include <linux/mman.h>
 #include <linux/mmzone.h>
+#include <linux/memblock.h>
 #include <linux/proc_fs.h>
 #include <linux/percpu.h>
 #include <linux/seq_file.h>
@@ -131,6 +132,18 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	show_val_kb(m, "VmallocChunk:   ", 0ul);
 	show_val_kb(m, "Percpu:         ", pcpu_nr_pages());
 
+#ifdef CONFIG_MEMTEST
+	if (early_memtest_done) {
+		unsigned long early_memtest_bad_size_kb;
+
+		early_memtest_bad_size_kb = early_memtest_bad_size>>10;
+		if (early_memtest_bad_size && !early_memtest_bad_size_kb)
+			early_memtest_bad_size_kb = 1;
+		/* When 0 is reported, it means there actually was a successful test */
+		seq_printf(m, "EarlyMemtestBad:   %5lu kB\n", early_memtest_bad_size_kb);
+	}
+#endif
+
 #ifdef CONFIG_MEMORY_FAILURE
 	seq_printf(m, "HardwareCorrupted: %5lu kB\n",
 		   atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10));
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 5851eb5bc726..8038833ff5b0 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -841,7 +841,6 @@ static int proc_sys_setattr(struct mnt_idmap *idmap,
 		return error;
 
 	setattr_copy(&nop_mnt_idmap, inode, attr);
-	mark_inode_dirty(inode);
 	return 0;
 }
 
@@ -1283,11 +1282,43 @@ out:
 	return err;
 }
 
+/* Find the directory for the ctl_table. If one is not found create it. */
+static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path)
+{
+	const char *name, *nextname;
+
+	for (name = path; name; name = nextname) {
+		int namelen;
+		nextname = strchr(name, '/');
+		if (nextname) {
+			namelen = nextname - name;
+			nextname++;
+		} else {
+			namelen = strlen(name);
+		}
+		if (namelen == 0)
+			continue;
+
+		/*
+		 * namelen ensures if name is "foo/bar/yay" only foo is
+		 * registered first. We traverse as if using mkdir -p and
+		 * return a ctl_dir for the last directory entry.
+		 */
+		dir = get_subdir(dir, name, namelen);
+		if (IS_ERR(dir))
+			break;
+	}
+	return dir;
+}
+
 /**
  * __register_sysctl_table - register a leaf sysctl table
  * @set: Sysctl tree to register on
  * @path: The path to the directory the sysctl table is in.
- * @table: the top-level table structure
+ * @table: the top-level table structure without any child. This table
+ * 	 should not be free'd after registration. So it should not be
+ * 	 used on stack. It can either be a global or dynamically allocated
+ * 	 by the caller and free'd later after sysctl unregistration.
  *
  * Register a sysctl table hierarchy. @table should be a filled in ctl_table
  * array. A completely 0 filled entry terminates the table.
@@ -1308,9 +1339,12 @@ out:
  * proc_handler - the text handler routine (described below)
  *
  * extra1, extra2 - extra pointers usable by the proc handler routines
+ * XXX: we should eventually modify these to use long min / max [0]
+ * [0] https://lkml.kernel.org/87zgpte9o4.fsf@email.froward.int.ebiederm.org
  *
  * Leaf nodes in the sysctl tree will be represented by a single file
- * under /proc; non-leaf nodes will be represented by directories.
+ * under /proc; non-leaf nodes (where child is not NULL) are not allowed,
+ * sysctl_check_table() verifies this.
  *
  * There must be a proc_handler routine for any terminal nodes.
  * Several default handlers are available to cover common cases -
@@ -1331,7 +1365,6 @@ struct ctl_table_header *__register_sysctl_table(
 {
 	struct ctl_table_root *root = set->dir.header.root;
 	struct ctl_table_header *header;
-	const char *name, *nextname;
 	struct ctl_dir *dir;
 	struct ctl_table *entry;
 	struct ctl_node *node;
@@ -1352,28 +1385,13 @@ struct ctl_table_header *__register_sysctl_table(
 
 	spin_lock(&sysctl_lock);
 	dir = &set->dir;
-	/* Reference moved down the diretory tree get_subdir */
+	/* Reference moved down the directory tree get_subdir */
 	dir->header.nreg++;
 	spin_unlock(&sysctl_lock);
 
-	/* Find the directory for the ctl_table */
-	for (name = path; name; name = nextname) {
-		int namelen;
-		nextname = strchr(name, '/');
-		if (nextname) {
-			namelen = nextname - name;
-			nextname++;
-		} else {
-			namelen = strlen(name);
-		}
-		if (namelen == 0)
-			continue;
-
-		dir = get_subdir(dir, name, namelen);
-		if (IS_ERR(dir))
-			goto fail;
-	}
-
+	dir = sysctl_mkdir_p(dir, path);
+	if (IS_ERR(dir))
+		goto fail;
 	spin_lock(&sysctl_lock);
 	if (insert_header(dir, header))
 		goto fail_put_dir_locked;
@@ -1394,8 +1412,15 @@ fail:
 
 /**
  * register_sysctl - register a sysctl table
- * @path: The path to the directory the sysctl table is in.
- * @table: the table structure
+ * @path: The path to the directory the sysctl table is in. If the path
+ * 	doesn't exist we will create it for you.
+ * @table: the table structure. The calller must ensure the life of the @table
+ * 	will be kept during the lifetime use of the syctl. It must not be freed
+ * 	until unregister_sysctl_table() is called with the given returned table
+ * 	with this registration. If your code is non modular then you don't need
+ * 	to call unregister_sysctl_table() and can instead use something like
+ * 	register_sysctl_init() which does not care for the result of the syctl
+ * 	registration.
  *
  * Register a sysctl table. @table should be a filled in ctl_table
  * array. A completely 0 filled entry terminates the table.
@@ -1411,8 +1436,11 @@ EXPORT_SYMBOL(register_sysctl);
 
 /**
  * __register_sysctl_init() - register sysctl table to path
- * @path: path name for sysctl base
- * @table: This is the sysctl table that needs to be registered to the path
+ * @path: path name for sysctl base. If that path doesn't exist we will create
+ * 	it for you.
+ * @table: This is the sysctl table that needs to be registered to the path.
+ * 	The caller must ensure the life of the @table will be kept during the
+ * 	lifetime use of the sysctl.
  * @table_name: The name of sysctl table, only used for log printing when
  *              registration fails
  *
@@ -1424,10 +1452,7 @@ EXPORT_SYMBOL(register_sysctl);
  * register_sysctl() failing on init are extremely low, and so for both reasons
  * this function does not return any error as it is used by initialization code.
  *
- * Context: Can only be called after your respective sysctl base path has been
- * registered. So for instance, most base directories are registered early on
- * init before init levels are processed through proc_sys_init() and
- * sysctl_init_bases().
+ * Context: if your base directory does not exist it will be created for you.
  */
 void __init __register_sysctl_init(const char *path, struct ctl_table *table,
 				 const char *table_name)
@@ -1550,24 +1575,18 @@ out:
 }
 
 /**
- * __register_sysctl_paths - register a sysctl table hierarchy
- * @set: Sysctl tree to register on
- * @path: The path to the directory the sysctl table is in.
+ * register_sysctl_table - register a sysctl table hierarchy
  * @table: the top-level table structure
  *
  * Register a sysctl table hierarchy. @table should be a filled in ctl_table
  * array. A completely 0 filled entry terminates the table.
- *
- * See __register_sysctl_table for more details.
+ * We are slowly deprecating this call so avoid its use.
  */
-struct ctl_table_header *__register_sysctl_paths(
-	struct ctl_table_set *set,
-	const struct ctl_path *path, struct ctl_table *table)
+struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
 {
 	struct ctl_table *ctl_table_arg = table;
 	int nr_subheaders = count_subheaders(table);
 	struct ctl_table_header *header = NULL, **subheaders, **subheader;
-	const struct ctl_path *component;
 	char *new_path, *pos;
 
 	pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL);
@@ -1575,11 +1594,6 @@ struct ctl_table_header *__register_sysctl_paths(
 		return NULL;
 
 	pos[0] = '\0';
-	for (component = path; component->procname; component++) {
-		pos = append_path(new_path, pos, component->procname);
-		if (!pos)
-			goto out;
-	}
 	while (table->procname && table->child && !table[1].procname) {
 		pos = append_path(new_path, pos, table->procname);
 		if (!pos)
@@ -1587,7 +1601,7 @@ struct ctl_table_header *__register_sysctl_paths(
 		table = table->child;
 	}
 	if (nr_subheaders == 1) {
-		header = __register_sysctl_table(set, new_path, table);
+		header = __register_sysctl_table(&sysctl_table_root.default_set, new_path, table);
 		if (header)
 			header->ctl_table_arg = ctl_table_arg;
 	} else {
@@ -1601,7 +1615,7 @@ struct ctl_table_header *__register_sysctl_paths(
 		header->ctl_table_arg = ctl_table_arg;
 
 		if (register_leaf_sysctl_tables(new_path, pos, &subheader,
-						set, table))
+						&sysctl_table_root.default_set, table))
 			goto err_register_leaves;
 	}
 
@@ -1620,40 +1634,6 @@ err_register_leaves:
 	header = NULL;
 	goto out;
 }
-
-/**
- * register_sysctl_paths - register a sysctl table hierarchy
- * @path: The path to the directory the sysctl table is in.
- * @table: the top-level table structure
- *
- * Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
- *
- * See __register_sysctl_paths for more details.
- */
-struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
-						struct ctl_table *table)
-{
-	return __register_sysctl_paths(&sysctl_table_root.default_set,
-					path, table);
-}
-EXPORT_SYMBOL(register_sysctl_paths);
-
-/**
- * register_sysctl_table - register a sysctl table hierarchy
- * @table: the top-level table structure
- *
- * Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
- *
- * See register_sysctl_paths for more details.
- */
-struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
-{
-	static const struct ctl_path null_path[] = { {} };
-
-	return register_sysctl_paths(null_path, table);
-}
 EXPORT_SYMBOL(register_sysctl_table);
 
 int __register_sysctl_base(struct ctl_table *base_table)
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 4fb8729a68d4..da60956b2915 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -22,30 +22,6 @@
 #define arch_irq_stat() 0
 #endif
 
-#ifdef arch_idle_time
-
-u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
-{
-	u64 idle;
-
-	idle = kcs->cpustat[CPUTIME_IDLE];
-	if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
-		idle += arch_idle_time(cpu);
-	return idle;
-}
-
-static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
-{
-	u64 iowait;
-
-	iowait = kcs->cpustat[CPUTIME_IOWAIT];
-	if (cpu_online(cpu) && nr_iowait_cpu(cpu))
-		iowait += arch_idle_time(cpu);
-	return iowait;
-}
-
-#else
-
 u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
 {
 	u64 idle, idle_usecs = -1ULL;
@@ -78,8 +54,6 @@ static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
 	return iowait;
 }
 
-#endif
-
 static void show_irq_gap(struct seq_file *p, unsigned int gap)
 {
 	static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0";
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6a96e1713fd5..420510f6a545 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -782,7 +782,6 @@ static void smap_gather_stats(struct vm_area_struct *vma,
 	if (start >= vma->vm_end)
 		return;
 
-#ifdef CONFIG_SHMEM
 	if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
 		/*
 		 * For shared or readonly shmem mappings we know that all
@@ -803,7 +802,7 @@ static void smap_gather_stats(struct vm_area_struct *vma,
 			ops = &smaps_shmem_walk_ops;
 		}
 	}
-#endif
+
 	/* mmap_lock is held in m_start */
 	if (!start)
 		walk_page_vma(vma, ops, mss);
@@ -1689,8 +1688,13 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 
 	/* watch out for wraparound */
 	start_vaddr = end_vaddr;
-	if (svpfn <= (ULONG_MAX >> PAGE_SHIFT))
-		start_vaddr = untagged_addr(svpfn << PAGE_SHIFT);
+	if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) {
+		ret = mmap_read_lock_killable(mm);
+		if (ret)
+			goto out_free;
+		start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT);
+		mmap_read_unlock(mm);
+	}
 
 	/* Ensure the address is inside the task */
 	if (start_vaddr > mm->task_size)
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 12af614f33ce..03f5963914a1 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -339,7 +339,7 @@ static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos)
 			return acc;
 	}
 
-	/* Read Elf note segment */
+	/* Read ELF note segment */
 	if (*fpos < elfcorebuf_sz + elfnotes_sz) {
 		void *kaddr;
 
@@ -1109,7 +1109,7 @@ static int __init process_ptload_program_headers_elf64(char *elfptr,
 	ehdr_ptr = (Elf64_Ehdr *)elfptr;
 	phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */
 
-	/* Skip Elf header, program headers and Elf note segment. */
+	/* Skip ELF header, program headers and ELF note segment. */
 	vmcore_off = elfsz + elfnotes_sz;
 
 	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
@@ -1152,7 +1152,7 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
 	ehdr_ptr = (Elf32_Ehdr *)elfptr;
 	phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */
 
-	/* Skip Elf header, program headers and Elf note segment. */
+	/* Skip ELF header, program headers and ELF note segment. */
 	vmcore_off = elfsz + elfnotes_sz;
 
 	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
@@ -1188,7 +1188,7 @@ static void set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
 	loff_t vmcore_off;
 	struct vmcore *m;
 
-	/* Skip Elf header, program headers and Elf note segment. */
+	/* Skip ELF header, program headers and ELF note segment. */
 	vmcore_off = elfsz + elfnotes_sz;
 
 	list_for_each_entry(m, vc_list, list) {
@@ -1213,7 +1213,7 @@ static int __init parse_crash_elf64_headers(void)
 
 	addr = elfcorehdr_addr;
 
-	/* Read Elf header */
+	/* Read ELF header */
 	rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf64_Ehdr), &addr);
 	if (rc < 0)
 		return rc;
@@ -1269,7 +1269,7 @@ static int __init parse_crash_elf32_headers(void)
 
 	addr = elfcorehdr_addr;
 
-	/* Read Elf header */
+	/* Read ELF header */
 	rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf32_Ehdr), &addr);
 	if (rc < 0)
 		return rc;
@@ -1376,12 +1376,12 @@ static void vmcoredd_write_header(void *buf, struct vmcoredd_data *data,
 }
 
 /**
- * vmcoredd_update_program_headers - Update all Elf program headers
+ * vmcoredd_update_program_headers - Update all ELF program headers
  * @elfptr: Pointer to elf header
  * @elfnotesz: Size of elf notes aligned to page size
  * @vmcoreddsz: Size of device dumps to be added to elf note header
  *
- * Determine type of Elf header (Elf64 or Elf32) and update the elf note size.
+ * Determine type of ELF header (Elf64 or Elf32) and update the elf note size.
  * Also update the offsets of all the program headers after the elf note header.
  */
 static void vmcoredd_update_program_headers(char *elfptr, size_t elfnotesz,
@@ -1439,10 +1439,10 @@ static void vmcoredd_update_program_headers(char *elfptr, size_t elfnotesz,
 
 /**
  * vmcoredd_update_size - Update the total size of the device dumps and update
- * Elf header
+ * ELF header
  * @dump_size: Size of the current device dump to be added to total size
  *
- * Update the total size of all the device dumps and update the Elf program
+ * Update the total size of all the device dumps and update the ELF program
  * headers. Calculate the new offsets for the vmcore list and update the
  * total vmcore size.
  */
@@ -1466,7 +1466,7 @@ static void vmcoredd_update_size(size_t dump_size)
  * @data: dump info.
  *
  * Allocate a buffer and invoke the calling driver's dump collect routine.
- * Write Elf note at the beginning of the buffer to indicate vmcore device
+ * Write ELF note at the beginning of the buffer to indicate vmcore device
  * dump and add the dump to global list.
  */
 int vmcore_add_device_dump(struct vmcoredd_data *data)
diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
index ab82e5f05346..55f139afa327 100644
--- a/fs/pstore/pmsg.c
+++ b/fs/pstore/pmsg.c
@@ -7,10 +7,9 @@
 #include <linux/device.h>
 #include <linux/fs.h>
 #include <linux/uaccess.h>
-#include <linux/rtmutex.h>
 #include "internal.h"
 
-static DEFINE_RT_MUTEX(pmsg_lock);
+static DEFINE_MUTEX(pmsg_lock);
 
 static ssize_t write_pmsg(struct file *file, const char __user *buf,
 			  size_t count, loff_t *ppos)
@@ -29,9 +28,9 @@ static ssize_t write_pmsg(struct file *file, const char __user *buf,
 	if (!access_ok(buf, count))
 		return -EFAULT;
 
-	rt_mutex_lock(&pmsg_lock);
+	mutex_lock(&pmsg_lock);
 	ret = psinfo->write_user(&record, buf);
-	rt_mutex_unlock(&pmsg_lock);
+	mutex_unlock(&pmsg_lock);
 	return ret ? ret : count;
 }
 
@@ -64,7 +63,7 @@ void pstore_register_pmsg(void)
 		goto err;
 	}
 
-	pmsg_class = class_create(THIS_MODULE, PMSG_NAME);
+	pmsg_class = class_create(PMSG_NAME);
 	if (IS_ERR(pmsg_class)) {
 		pr_err("device class file already in use\n");
 		goto err_class;
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 2f67516bb9bf..9fbb9b5256f7 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -70,7 +70,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 
 	/* make various checks */
 	order = get_order(newsize);
-	if (unlikely(order >= MAX_ORDER))
+	if (unlikely(order > MAX_ORDER))
 		return -EFBIG;
 
 	ret = inode_newsize_ok(inode, newsize);
diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h
index ace133cf6072..3b43a51e6f7e 100644
--- a/fs/smbfs_common/smb2pdu.h
+++ b/fs/smbfs_common/smb2pdu.h
@@ -327,17 +327,18 @@ struct smb2_tree_connect_req {
 #define SMB2_SHAREFLAG_NO_CACHING			0x00000030
 #define SHI1005_FLAGS_DFS				0x00000001
 #define SHI1005_FLAGS_DFS_ROOT				0x00000002
-#define SHI1005_FLAGS_RESTRICT_EXCLUSIVE_OPENS		0x00000100
-#define SHI1005_FLAGS_FORCE_SHARED_DELETE		0x00000200
-#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING		0x00000400
-#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM	0x00000800
-#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK		0x00001000
-#define SHI1005_FLAGS_ENABLE_HASH_V1			0x00002000
-#define SHI1005_FLAGS_ENABLE_HASH_V2			0x00004000
+#define SMB2_SHAREFLAG_RESTRICT_EXCLUSIVE_OPENS		0x00000100
+#define SMB2_SHAREFLAG_FORCE_SHARED_DELETE		0x00000200
+#define SMB2_SHAREFLAG_ALLOW_NAMESPACE_CACHING		0x00000400
+#define SMB2_SHAREFLAG_ACCESS_BASED_DIRECTORY_ENUM	0x00000800
+#define SMB2_SHAREFLAG_FORCE_LEVELII_OPLOCK		0x00001000
+#define SMB2_SHAREFLAG_ENABLE_HASH_V1			0x00002000
+#define SMB2_SHAREFLAG_ENABLE_HASH_V2			0x00004000
 #define SHI1005_FLAGS_ENCRYPT_DATA			0x00008000
 #define SMB2_SHAREFLAG_IDENTITY_REMOTING		0x00040000 /* 3.1.1 */
 #define SMB2_SHAREFLAG_COMPRESS_DATA			0x00100000 /* 3.1.1 */
-#define SHI1005_FLAGS_ALL				0x0014FF33
+#define SMB2_SHAREFLAG_ISOLATED_TRANSPORT		0x00200000
+#define SHI1005_FLAGS_ALL				0x0034FF33
 
 /* Possible share capabilities */
 #define SMB2_SHARE_CAP_DFS	cpu_to_le32(0x00000008) /* all dialects */
@@ -1171,6 +1172,34 @@ struct create_posix {
 	__u32   Reserved;
 } __packed;
 
+/* See MS-SMB2 2.2.13.2.3 and MS-SMB2 2.2.13.2.4 */
+struct create_durable {
+	struct create_context ccontext;
+	__u8   Name[8];
+	union {
+		__u8  Reserved[16];
+		struct {
+			__u64 PersistentFileId;
+			__u64 VolatileFileId;
+		} Fid;
+	} Data;
+} __packed;
+
+/* See MS-SMB2 2.2.13.2.5 */
+struct create_mxac_req {
+	struct create_context ccontext;
+	__u8   Name[8];
+	__le64 Timestamp;
+} __packed;
+
+/* See MS-SMB2 2.2.14.2.5 */
+struct create_mxac_rsp {
+	struct create_context ccontext;
+	__u8   Name[8];
+	__le32 QueryStatus;
+	__le32 MaximalAccess;
+} __packed;
+
 #define SMB2_LEASE_NONE_LE			cpu_to_le32(0x00)
 #define SMB2_LEASE_READ_CACHING_LE		cpu_to_le32(0x01)
 #define SMB2_LEASE_HANDLE_CACHING_LE		cpu_to_le32(0x02)
@@ -1180,6 +1209,7 @@ struct create_posix {
 
 #define SMB2_LEASE_KEY_SIZE			16
 
+/* See MS-SMB2 2.2.13.2.8 */
 struct lease_context {
 	__u8 LeaseKey[SMB2_LEASE_KEY_SIZE];
 	__le32 LeaseState;
@@ -1187,6 +1217,7 @@ struct lease_context {
 	__le64 LeaseDuration;
 } __packed;
 
+/* See MS-SMB2 2.2.13.2.10 */
 struct lease_context_v2 {
 	__u8 LeaseKey[SMB2_LEASE_KEY_SIZE];
 	__le32 LeaseState;
@@ -1210,6 +1241,15 @@ struct create_lease_v2 {
 	__u8   Pad[4];
 } __packed;
 
+/* See MS-SMB2 2.2.14.2.9 */
+struct create_disk_id_rsp {
+	struct create_context ccontext;
+	__u8   Name[8];
+	__le64 DiskFileId;
+	__le64 VolumeId;
+	__u8  Reserved[16];
+} __packed;
+
 /* See MS-SMB2 2.2.31 and 2.2.32 */
 struct smb2_ioctl_req {
 	struct smb2_hdr hdr;
diff --git a/fs/super.c b/fs/super.c
index 04bc62ab7dfe..34afe411cf2b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -54,7 +54,7 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = {
  * One thing we have to be careful of with a per-sb shrinker is that we don't
  * drop the last active reference to the superblock from within the shrinker.
  * If that happens we could trigger unregistering the shrinker from within the
- * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
+ * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we
  * take a passive reference to the superblock to avoid this from occurring.
  */
 static unsigned long super_cache_scan(struct shrinker *shrink,
diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c
index 3a92e6af69b2..75461777c466 100644
--- a/fs/ubifs/compress.c
+++ b/fs/ubifs/compress.c
@@ -217,7 +217,6 @@ static void compr_exit(struct ubifs_compressor *compr)
 {
 	if (compr->capi_name)
 		crypto_free_comp(compr->cc);
-	return;
 }
 
 /**
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 1505539f6fe9..ef0499edc248 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -358,7 +358,6 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry)
 	umode_t mode = S_IFCHR | WHITEOUT_MODE;
 	struct inode *inode;
 	struct ubifs_info *c = dir->i_sb->s_fs_info;
-	struct fscrypt_name nm;
 
 	/*
 	 * Create an inode('nlink = 1') for whiteout without updating journal,
@@ -369,10 +368,6 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry)
 	dbg_gen("dent '%pd', mode %#hx in dir ino %lu",
 		dentry, mode, dir->i_ino);
 
-	err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
-	if (err)
-		return ERR_PTR(err);
-
 	inode = ubifs_new_inode(c, dir, mode, false);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
@@ -395,7 +390,6 @@ out_inode:
 	make_bad_inode(inode);
 	iput(inode);
 out_free:
-	fscrypt_free_filename(&nm);
 	ubifs_err(c, "cannot create whiteout file, error %d", err);
 	return ERR_PTR(err);
 }
@@ -492,6 +486,7 @@ static int ubifs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
 	unlock_2_inodes(dir, inode);
 
 	ubifs_release_budget(c, &req);
+	fscrypt_free_filename(&nm);
 
 	return finish_open_simple(file, 0);
 
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 2469f72eeaab..6b7d95b65f4b 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -44,6 +44,33 @@ enum {
 	NOT_ON_MEDIA = 3,
 };
 
+static void do_insert_old_idx(struct ubifs_info *c,
+			      struct ubifs_old_idx *old_idx)
+{
+	struct ubifs_old_idx *o;
+	struct rb_node **p, *parent = NULL;
+
+	p = &c->old_idx.rb_node;
+	while (*p) {
+		parent = *p;
+		o = rb_entry(parent, struct ubifs_old_idx, rb);
+		if (old_idx->lnum < o->lnum)
+			p = &(*p)->rb_left;
+		else if (old_idx->lnum > o->lnum)
+			p = &(*p)->rb_right;
+		else if (old_idx->offs < o->offs)
+			p = &(*p)->rb_left;
+		else if (old_idx->offs > o->offs)
+			p = &(*p)->rb_right;
+		else {
+			ubifs_err(c, "old idx added twice!");
+			kfree(old_idx);
+		}
+	}
+	rb_link_node(&old_idx->rb, parent, p);
+	rb_insert_color(&old_idx->rb, &c->old_idx);
+}
+
 /**
  * insert_old_idx - record an index node obsoleted since the last commit start.
  * @c: UBIFS file-system description object
@@ -69,35 +96,15 @@ enum {
  */
 static int insert_old_idx(struct ubifs_info *c, int lnum, int offs)
 {
-	struct ubifs_old_idx *old_idx, *o;
-	struct rb_node **p, *parent = NULL;
+	struct ubifs_old_idx *old_idx;
 
 	old_idx = kmalloc(sizeof(struct ubifs_old_idx), GFP_NOFS);
 	if (unlikely(!old_idx))
 		return -ENOMEM;
 	old_idx->lnum = lnum;
 	old_idx->offs = offs;
+	do_insert_old_idx(c, old_idx);
 
-	p = &c->old_idx.rb_node;
-	while (*p) {
-		parent = *p;
-		o = rb_entry(parent, struct ubifs_old_idx, rb);
-		if (lnum < o->lnum)
-			p = &(*p)->rb_left;
-		else if (lnum > o->lnum)
-			p = &(*p)->rb_right;
-		else if (offs < o->offs)
-			p = &(*p)->rb_left;
-		else if (offs > o->offs)
-			p = &(*p)->rb_right;
-		else {
-			ubifs_err(c, "old idx added twice!");
-			kfree(old_idx);
-			return 0;
-		}
-	}
-	rb_link_node(&old_idx->rb, parent, p);
-	rb_insert_color(&old_idx->rb, &c->old_idx);
 	return 0;
 }
 
@@ -199,23 +206,6 @@ static struct ubifs_znode *copy_znode(struct ubifs_info *c,
 	__set_bit(DIRTY_ZNODE, &zn->flags);
 	__clear_bit(COW_ZNODE, &zn->flags);
 
-	ubifs_assert(c, !ubifs_zn_obsolete(znode));
-	__set_bit(OBSOLETE_ZNODE, &znode->flags);
-
-	if (znode->level != 0) {
-		int i;
-		const int n = zn->child_cnt;
-
-		/* The children now have new parent */
-		for (i = 0; i < n; i++) {
-			struct ubifs_zbranch *zbr = &zn->zbranch[i];
-
-			if (zbr->znode)
-				zbr->znode->parent = zn;
-		}
-	}
-
-	atomic_long_inc(&c->dirty_zn_cnt);
 	return zn;
 }
 
@@ -234,6 +224,42 @@ static int add_idx_dirt(struct ubifs_info *c, int lnum, int dirt)
 }
 
 /**
+ * replace_znode - replace old znode with new znode.
+ * @c: UBIFS file-system description object
+ * @new_zn: new znode
+ * @old_zn: old znode
+ * @zbr: the branch of parent znode
+ *
+ * Replace old znode with new znode in TNC.
+ */
+static void replace_znode(struct ubifs_info *c, struct ubifs_znode *new_zn,
+			  struct ubifs_znode *old_zn, struct ubifs_zbranch *zbr)
+{
+	ubifs_assert(c, !ubifs_zn_obsolete(old_zn));
+	__set_bit(OBSOLETE_ZNODE, &old_zn->flags);
+
+	if (old_zn->level != 0) {
+		int i;
+		const int n = new_zn->child_cnt;
+
+		/* The children now have new parent */
+		for (i = 0; i < n; i++) {
+			struct ubifs_zbranch *child = &new_zn->zbranch[i];
+
+			if (child->znode)
+				child->znode->parent = new_zn;
+		}
+	}
+
+	zbr->znode = new_zn;
+	zbr->lnum = 0;
+	zbr->offs = 0;
+	zbr->len = 0;
+
+	atomic_long_inc(&c->dirty_zn_cnt);
+}
+
+/**
  * dirty_cow_znode - ensure a znode is not being committed.
  * @c: UBIFS file-system description object
  * @zbr: branch of znode to check
@@ -265,28 +291,32 @@ static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c,
 		return zn;
 
 	if (zbr->len) {
-		err = insert_old_idx(c, zbr->lnum, zbr->offs);
-		if (unlikely(err))
-			/*
-			 * Obsolete znodes will be freed by tnc_destroy_cnext()
-			 * or free_obsolete_znodes(), copied up znodes should
-			 * be added back to tnc and freed by
-			 * ubifs_destroy_tnc_subtree().
-			 */
+		struct ubifs_old_idx *old_idx;
+
+		old_idx = kmalloc(sizeof(struct ubifs_old_idx), GFP_NOFS);
+		if (unlikely(!old_idx)) {
+			err = -ENOMEM;
 			goto out;
+		}
+		old_idx->lnum = zbr->lnum;
+		old_idx->offs = zbr->offs;
+
 		err = add_idx_dirt(c, zbr->lnum, zbr->len);
-	} else
-		err = 0;
+		if (err) {
+			kfree(old_idx);
+			goto out;
+		}
 
-out:
-	zbr->znode = zn;
-	zbr->lnum = 0;
-	zbr->offs = 0;
-	zbr->len = 0;
+		do_insert_old_idx(c, old_idx);
+	}
+
+	replace_znode(c, zn, znode, zbr);
 
-	if (unlikely(err))
-		return ERR_PTR(err);
 	return zn;
+
+out:
+	kfree(zn);
+	return ERR_PTR(err);
 }
 
 /**
diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
index 67aaadc3ab07..8395066341a4 100644
--- a/fs/unicode/utf8-core.c
+++ b/fs/unicode/utf8-core.c
@@ -214,4 +214,3 @@ void utf8_unload(struct unicode_map *um)
 }
 EXPORT_SYMBOL(utf8_unload);
 
-MODULE_LICENSE("GPL v2");
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 40f9e1a2ebdd..0fd96d6e39ce 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -32,7 +32,22 @@
 #include <linux/swapops.h>
 #include <linux/miscdevice.h>
 
-int sysctl_unprivileged_userfaultfd __read_mostly;
+static int sysctl_unprivileged_userfaultfd __read_mostly;
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table vm_userfaultfd_table[] = {
+	{
+		.procname	= "unprivileged_userfaultfd",
+		.data		= &sysctl_unprivileged_userfaultfd,
+		.maxlen		= sizeof(sysctl_unprivileged_userfaultfd),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{ }
+};
+#endif
 
 static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
 
@@ -108,6 +123,21 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
 	return ctx->features & UFFD_FEATURE_INITIALIZED;
 }
 
+/*
+ * Whether WP_UNPOPULATED is enabled on the uffd context.  It is only
+ * meaningful when userfaultfd_wp()==true on the vma and when it's
+ * anonymous.
+ */
+bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
+{
+	struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
+
+	if (!ctx)
+		return false;
+
+	return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
+}
+
 static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
 				     vm_flags_t flags)
 {
@@ -1629,7 +1659,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 
 		/* Reset ptes for the whole vma range if wr-protected */
 		if (userfaultfd_wp(vma))
-			uffd_wp_range(mm, vma, start, vma_end - start, false);
+			uffd_wp_range(vma, start, vma_end - start, false);
 
 		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
 		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
@@ -1714,6 +1744,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
 	struct uffdio_copy uffdio_copy;
 	struct uffdio_copy __user *user_uffdio_copy;
 	struct userfaultfd_wake_range range;
+	uffd_flags_t flags = 0;
 
 	user_uffdio_copy = (struct uffdio_copy __user *) arg;
 
@@ -1740,10 +1771,12 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
 		goto out;
 	if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
 		goto out;
+	if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
+		flags |= MFILL_ATOMIC_WP;
 	if (mmget_not_zero(ctx->mm)) {
-		ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
-				   uffdio_copy.len, &ctx->mmap_changing,
-				   uffdio_copy.mode);
+		ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
+					uffdio_copy.len, &ctx->mmap_changing,
+					flags);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -1793,9 +1826,9 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
 		goto out;
 
 	if (mmget_not_zero(ctx->mm)) {
-		ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
-				     uffdio_zeropage.range.len,
-				     &ctx->mmap_changing);
+		ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
+					   uffdio_zeropage.range.len,
+					   &ctx->mmap_changing);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -1875,6 +1908,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
 	struct uffdio_continue uffdio_continue;
 	struct uffdio_continue __user *user_uffdio_continue;
 	struct userfaultfd_wake_range range;
+	uffd_flags_t flags = 0;
 
 	user_uffdio_continue = (struct uffdio_continue __user *)arg;
 
@@ -1899,13 +1933,16 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
 	    uffdio_continue.range.start) {
 		goto out;
 	}
-	if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
+	if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
+				     UFFDIO_CONTINUE_MODE_WP))
 		goto out;
+	if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP)
+		flags |= MFILL_ATOMIC_WP;
 
 	if (mmget_not_zero(ctx->mm)) {
-		ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
-				     uffdio_continue.range.len,
-				     &ctx->mmap_changing);
+		ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
+					    uffdio_continue.range.len,
+					    &ctx->mmap_changing, flags);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -1973,6 +2010,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
 #endif
 #ifndef CONFIG_PTE_MARKER_UFFD_WP
 	uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
+	uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
 #endif
 	uffdio_api.ioctls = UFFD_API_IOCTLS;
 	ret = -EFAULT;
@@ -2180,6 +2218,9 @@ static int __init userfaultfd_init(void)
 						0,
 						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
 						init_once_userfaultfd_ctx);
+#ifdef CONFIG_SYSCTL
+	register_sysctl_init("vm", vm_userfaultfd_table);
+#endif
 	return 0;
 }
 __initcall(userfaultfd_init);
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 9fac5ea8d0e4..52e1823241fb 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -47,6 +47,33 @@ config XFS_SUPPORT_V4
 	  To continue supporting the old V4 format (crc=0), say Y.
 	  To close off an attack surface, say N.
 
+config XFS_SUPPORT_ASCII_CI
+	bool "Support deprecated case-insensitive ascii (ascii-ci=1) format"
+	depends on XFS_FS
+	default y
+	help
+	  The ASCII case insensitivity filesystem feature only works correctly
+	  on systems that have been coerced into using ISO 8859-1, and it does
+	  not work on extended attributes.  The kernel has no visibility into
+	  the locale settings in userspace, so it corrupts UTF-8 names.
+	  Enabling this feature makes XFS vulnerable to mixed case sensitivity
+	  attacks.  Because of this, the feature is deprecated.  All users
+	  should upgrade by backing up their files, reformatting, and restoring
+	  from the backup.
+
+	  Administrators and users can detect such a filesystem by running
+	  xfs_info against a filesystem mountpoint and checking for a string
+	  beginning with "ascii-ci=".  If the string "ascii-ci=1" is found, the
+	  filesystem is a case-insensitive filesystem.  If no such string is
+	  found, please upgrade xfsprogs to the latest version and try again.
+
+	  This option will become default N in September 2025.  Support for the
+	  feature will be removed entirely in September 2030.  Distributors
+	  can say N here to withdraw support earlier.
+
+	  To continue supporting case-insensitivity (ascii-ci=1), say Y.
+	  To close off an attack surface, say N.
+
 config XFS_QUOTA
 	bool "XFS Quota support"
 	depends on XFS_FS
@@ -93,10 +120,15 @@ config XFS_RT
 
 	  If unsure, say N.
 
+config XFS_DRAIN_INTENTS
+	bool
+	select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL
+
 config XFS_ONLINE_SCRUB
 	bool "XFS online metadata check support"
 	default n
 	depends on XFS_FS
+	select XFS_DRAIN_INTENTS
 	help
 	  If you say Y here you will be able to check metadata on a
 	  mounted XFS filesystem.  This feature is intended to reduce
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 92d88dc3c9f7..16e4eb431230 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -136,6 +136,8 @@ ifeq ($(CONFIG_MEMORY_FAILURE),y)
 xfs-$(CONFIG_FS_DAX)		+= xfs_notify_failure.o
 endif
 
+xfs-$(CONFIG_XFS_DRAIN_INTENTS)	+= xfs_drain.o
+
 # online scrub/repair
 ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y)
 
@@ -146,6 +148,7 @@ xfs-y				+= $(addprefix scrub/, \
 				   agheader.o \
 				   alloc.o \
 				   attr.o \
+				   bitmap.o \
 				   bmap.o \
 				   btree.o \
 				   common.o \
@@ -156,6 +159,7 @@ xfs-y				+= $(addprefix scrub/, \
 				   ialloc.o \
 				   inode.o \
 				   parent.o \
+				   readdir.o \
 				   refcount.o \
 				   rmap.o \
 				   scrub.o \
@@ -169,7 +173,6 @@ xfs-$(CONFIG_XFS_QUOTA)		+= scrub/quota.o
 ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
 xfs-y				+= $(addprefix scrub/, \
 				   agheader_repair.o \
-				   bitmap.o \
 				   repair.o \
 				   )
 endif
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 86696a1c6891..1b078bbbf225 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -81,6 +81,19 @@ xfs_perag_get_tag(
 	return pag;
 }
 
+/* Get a passive reference to the given perag. */
+struct xfs_perag *
+xfs_perag_hold(
+	struct xfs_perag	*pag)
+{
+	ASSERT(atomic_read(&pag->pag_ref) > 0 ||
+	       atomic_read(&pag->pag_active_ref) > 0);
+
+	trace_xfs_perag_hold(pag, _RET_IP_);
+	atomic_inc(&pag->pag_ref);
+	return pag;
+}
+
 void
 xfs_perag_put(
 	struct xfs_perag	*pag)
@@ -247,6 +260,7 @@ xfs_free_perag(
 		spin_unlock(&mp->m_perag_lock);
 		ASSERT(pag);
 		XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0);
+		xfs_defer_drain_free(&pag->pag_intents_drain);
 
 		cancel_delayed_work_sync(&pag->pag_blockgc_work);
 		xfs_buf_hash_destroy(pag);
@@ -372,6 +386,7 @@ xfs_initialize_perag(
 		spin_lock_init(&pag->pag_state_lock);
 		INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker);
 		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
+		xfs_defer_drain_init(&pag->pag_intents_drain);
 		init_waitqueue_head(&pag->pagb_wait);
 		init_waitqueue_head(&pag->pag_active_wq);
 		pag->pagb_count = 0;
@@ -408,6 +423,7 @@ xfs_initialize_perag(
 	return 0;
 
 out_remove_pag:
+	xfs_defer_drain_free(&pag->pag_intents_drain);
 	radix_tree_delete(&mp->m_perag_tree, index);
 out_free_pag:
 	kmem_free(pag);
@@ -418,6 +434,7 @@ out_unwind_new_pags:
 		if (!pag)
 			break;
 		xfs_buf_hash_destroy(pag);
+		xfs_defer_drain_free(&pag->pag_intents_drain);
 		kmem_free(pag);
 	}
 	return error;
@@ -1043,10 +1060,8 @@ xfs_ag_extend_space(
 	if (error)
 		return error;
 
-	error = xfs_free_extent(tp, XFS_AGB_TO_FSB(pag->pag_mount, pag->pag_agno,
-					be32_to_cpu(agf->agf_length) - len),
-				len, &XFS_RMAP_OINFO_SKIP_UPDATE,
-				XFS_AG_RESV_NONE);
+	error = xfs_free_extent(tp, pag, be32_to_cpu(agf->agf_length) - len,
+			len, &XFS_RMAP_OINFO_SKIP_UPDATE, XFS_AG_RESV_NONE);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 5e18536dfdce..2e0aef87d633 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -101,6 +101,14 @@ struct xfs_perag {
 	/* background prealloc block trimming */
 	struct delayed_work	pag_blockgc_work;
 
+	/*
+	 * We use xfs_drain to track the number of deferred log intent items
+	 * that have been queued (but not yet processed) so that waiters (e.g.
+	 * scrub) will not lock resources when other threads are in the middle
+	 * of processing a chain of intent items only to find momentary
+	 * inconsistencies.
+	 */
+	struct xfs_defer_drain	pag_intents_drain;
 #endif /* __KERNEL__ */
 };
 
@@ -134,6 +142,7 @@ void xfs_free_perag(struct xfs_mount *mp);
 struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
 struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
 		unsigned int tag);
+struct xfs_perag *xfs_perag_hold(struct xfs_perag *pag);
 void xfs_perag_put(struct xfs_perag *pag);
 
 /* Active AG references */
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 203f16c48c19..fdfa08cbf4db 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -233,6 +233,52 @@ xfs_alloc_update(
 	return xfs_btree_update(cur, &rec);
 }
 
+/* Convert the ondisk btree record to its incore representation. */
+void
+xfs_alloc_btrec_to_irec(
+	const union xfs_btree_rec	*rec,
+	struct xfs_alloc_rec_incore	*irec)
+{
+	irec->ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
+	irec->ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
+}
+
+/* Simple checks for free space records. */
+xfs_failaddr_t
+xfs_alloc_check_irec(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_alloc_rec_incore *irec)
+{
+	struct xfs_perag		*pag = cur->bc_ag.pag;
+
+	if (irec->ar_blockcount == 0)
+		return __this_address;
+
+	/* check for valid extent range, including overflow */
+	if (!xfs_verify_agbext(pag, irec->ar_startblock, irec->ar_blockcount))
+		return __this_address;
+
+	return NULL;
+}
+
+static inline int
+xfs_alloc_complain_bad_rec(
+	struct xfs_btree_cur		*cur,
+	xfs_failaddr_t			fa,
+	const struct xfs_alloc_rec_incore *irec)
+{
+	struct xfs_mount		*mp = cur->bc_mp;
+
+	xfs_warn(mp,
+		"%s Freespace BTree record corruption in AG %d detected at %pS!",
+		cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size",
+		cur->bc_ag.pag->pag_agno, fa);
+	xfs_warn(mp,
+		"start block 0x%x block count 0x%x", irec->ar_startblock,
+		irec->ar_blockcount);
+	return -EFSCORRUPTED;
+}
+
 /*
  * Get the data from the pointed-to record.
  */
@@ -243,35 +289,23 @@ xfs_alloc_get_rec(
 	xfs_extlen_t		*len,	/* output: length of extent */
 	int			*stat)	/* output: success/failure */
 {
-	struct xfs_mount	*mp = cur->bc_mp;
-	struct xfs_perag	*pag = cur->bc_ag.pag;
+	struct xfs_alloc_rec_incore irec;
 	union xfs_btree_rec	*rec;
+	xfs_failaddr_t		fa;
 	int			error;
 
 	error = xfs_btree_get_rec(cur, &rec, stat);
 	if (error || !(*stat))
 		return error;
 
-	*bno = be32_to_cpu(rec->alloc.ar_startblock);
-	*len = be32_to_cpu(rec->alloc.ar_blockcount);
-
-	if (*len == 0)
-		goto out_bad_rec;
-
-	/* check for valid extent range, including overflow */
-	if (!xfs_verify_agbext(pag, *bno, *len))
-		goto out_bad_rec;
+	xfs_alloc_btrec_to_irec(rec, &irec);
+	fa = xfs_alloc_check_irec(cur, &irec);
+	if (fa)
+		return xfs_alloc_complain_bad_rec(cur, fa, &irec);
 
+	*bno = irec.ar_startblock;
+	*len = irec.ar_blockcount;
 	return 0;
-
-out_bad_rec:
-	xfs_warn(mp,
-		"%s Freespace BTree record corruption in AG %d detected!",
-		cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size",
-		pag->pag_agno);
-	xfs_warn(mp,
-		"start block 0x%x block count 0x%x", *bno, *len);
-	return -EFSCORRUPTED;
 }
 
 /*
@@ -2405,6 +2439,7 @@ xfs_defer_agfl_block(
 
 	trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
 
+	xfs_extent_free_get_group(mp, xefi);
 	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list);
 }
 
@@ -2421,8 +2456,8 @@ __xfs_free_extent_later(
 	bool				skip_discard)
 {
 	struct xfs_extent_free_item	*xefi;
-#ifdef DEBUG
 	struct xfs_mount		*mp = tp->t_mountp;
+#ifdef DEBUG
 	xfs_agnumber_t			agno;
 	xfs_agblock_t			agbno;
 
@@ -2456,9 +2491,11 @@ __xfs_free_extent_later(
 	} else {
 		xefi->xefi_owner = XFS_RMAP_OWN_NULL;
 	}
-	trace_xfs_bmap_free_defer(tp->t_mountp,
+	trace_xfs_bmap_free_defer(mp,
 			XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
 			XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
+
+	xfs_extent_free_get_group(mp, xefi);
 	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list);
 }
 
@@ -3596,7 +3633,8 @@ xfs_free_extent_fix_freelist(
 int
 __xfs_free_extent(
 	struct xfs_trans		*tp,
-	xfs_fsblock_t			bno,
+	struct xfs_perag		*pag,
+	xfs_agblock_t			agbno,
 	xfs_extlen_t			len,
 	const struct xfs_owner_info	*oinfo,
 	enum xfs_ag_resv_type		type,
@@ -3604,12 +3642,9 @@ __xfs_free_extent(
 {
 	struct xfs_mount		*mp = tp->t_mountp;
 	struct xfs_buf			*agbp;
-	xfs_agnumber_t			agno = XFS_FSB_TO_AGNO(mp, bno);
-	xfs_agblock_t			agbno = XFS_FSB_TO_AGBNO(mp, bno);
 	struct xfs_agf			*agf;
 	int				error;
 	unsigned int			busy_flags = 0;
-	struct xfs_perag		*pag;
 
 	ASSERT(len != 0);
 	ASSERT(type != XFS_AG_RESV_AGFL);
@@ -3618,10 +3653,9 @@ __xfs_free_extent(
 			XFS_ERRTAG_FREE_EXTENT))
 		return -EIO;
 
-	pag = xfs_perag_get(mp, agno);
 	error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
 	if (error)
-		goto err;
+		return error;
 	agf = agbp->b_addr;
 
 	if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
@@ -3635,20 +3669,18 @@ __xfs_free_extent(
 		goto err_release;
 	}
 
-	error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
+	error = xfs_free_ag_extent(tp, agbp, pag->pag_agno, agbno, len, oinfo,
+			type);
 	if (error)
 		goto err_release;
 
 	if (skip_discard)
 		busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
 	xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags);
-	xfs_perag_put(pag);
 	return 0;
 
 err_release:
 	xfs_trans_brelse(tp, agbp);
-err:
-	xfs_perag_put(pag);
 	return error;
 }
 
@@ -3666,9 +3698,13 @@ xfs_alloc_query_range_helper(
 {
 	struct xfs_alloc_query_range_info	*query = priv;
 	struct xfs_alloc_rec_incore		irec;
+	xfs_failaddr_t				fa;
+
+	xfs_alloc_btrec_to_irec(rec, &irec);
+	fa = xfs_alloc_check_irec(cur, &irec);
+	if (fa)
+		return xfs_alloc_complain_bad_rec(cur, fa, &irec);
 
-	irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
-	irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
 	return query->fn(cur, &irec, query->priv);
 }
 
@@ -3709,13 +3745,16 @@ xfs_alloc_query_all(
 	return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
 }
 
-/* Is there a record covering a given extent? */
+/*
+ * Scan part of the keyspace of the free space and tell us if the area has no
+ * records, is fully mapped by records, or is partially filled.
+ */
 int
-xfs_alloc_has_record(
+xfs_alloc_has_records(
 	struct xfs_btree_cur	*cur,
 	xfs_agblock_t		bno,
 	xfs_extlen_t		len,
-	bool			*exists)
+	enum xbtree_recpacking	*outcome)
 {
 	union xfs_btree_irec	low;
 	union xfs_btree_irec	high;
@@ -3725,7 +3764,7 @@ xfs_alloc_has_record(
 	memset(&high, 0xFF, sizeof(high));
 	high.a.ar_startblock = bno + len - 1;
 
-	return xfs_btree_has_record(cur, &low, &high, exists);
+	return xfs_btree_has_records(cur, &low, &high, NULL, outcome);
 }
 
 /*
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 2b246d74c189..5dbb25546d0b 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -141,7 +141,8 @@ int xfs_alloc_vextent_first_ag(struct xfs_alloc_arg *args,
 int				/* error */
 __xfs_free_extent(
 	struct xfs_trans	*tp,	/* transaction pointer */
-	xfs_fsblock_t		bno,	/* starting block number of extent */
+	struct xfs_perag	*pag,
+	xfs_agblock_t		agbno,
 	xfs_extlen_t		len,	/* length of extent */
 	const struct xfs_owner_info	*oinfo,	/* extent owner */
 	enum xfs_ag_resv_type	type,	/* block reservation type */
@@ -150,12 +151,13 @@ __xfs_free_extent(
 static inline int
 xfs_free_extent(
 	struct xfs_trans	*tp,
-	xfs_fsblock_t		bno,
+	struct xfs_perag	*pag,
+	xfs_agblock_t		agbno,
 	xfs_extlen_t		len,
 	const struct xfs_owner_info	*oinfo,
 	enum xfs_ag_resv_type	type)
 {
-	return __xfs_free_extent(tp, bno, len, oinfo, type, false);
+	return __xfs_free_extent(tp, pag, agbno, len, oinfo, type, false);
 }
 
 int				/* error */
@@ -179,6 +181,12 @@ xfs_alloc_get_rec(
 	xfs_extlen_t		*len,	/* output: length of extent */
 	int			*stat);	/* output: success/failure */
 
+union xfs_btree_rec;
+void xfs_alloc_btrec_to_irec(const union xfs_btree_rec *rec,
+		struct xfs_alloc_rec_incore *irec);
+xfs_failaddr_t xfs_alloc_check_irec(struct xfs_btree_cur *cur,
+		const struct xfs_alloc_rec_incore *irec);
+
 int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags,
 		struct xfs_buf **agfbpp);
 int xfs_alloc_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags,
@@ -205,8 +213,8 @@ int xfs_alloc_query_range(struct xfs_btree_cur *cur,
 int xfs_alloc_query_all(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn,
 		void *priv);
 
-int xfs_alloc_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-		xfs_extlen_t len, bool *exist);
+int xfs_alloc_has_records(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+		xfs_extlen_t len, enum xbtree_recpacking *outcome);
 
 typedef int (*xfs_agfl_walk_fn)(struct xfs_mount *mp, xfs_agblock_t bno,
 		void *priv);
@@ -235,9 +243,13 @@ struct xfs_extent_free_item {
 	uint64_t		xefi_owner;
 	xfs_fsblock_t		xefi_startblock;/* starting fs block number */
 	xfs_extlen_t		xefi_blockcount;/* number of blocks in extent */
+	struct xfs_perag	*xefi_pag;
 	unsigned int		xefi_flags;
 };
 
+void xfs_extent_free_get_group(struct xfs_mount *mp,
+		struct xfs_extent_free_item *xefi);
+
 #define XFS_EFI_SKIP_DISCARD	(1U << 0) /* don't issue discard */
 #define XFS_EFI_ATTR_FORK	(1U << 1) /* freeing attr fork block */
 #define XFS_EFI_BMBT_BLOCK	(1U << 2) /* freeing bmap btree block */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 0f29c7b1b39f..c65228efed4a 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -260,20 +260,27 @@ STATIC int64_t
 xfs_bnobt_diff_two_keys(
 	struct xfs_btree_cur		*cur,
 	const union xfs_btree_key	*k1,
-	const union xfs_btree_key	*k2)
+	const union xfs_btree_key	*k2,
+	const union xfs_btree_key	*mask)
 {
+	ASSERT(!mask || mask->alloc.ar_startblock);
+
 	return (int64_t)be32_to_cpu(k1->alloc.ar_startblock) -
-			  be32_to_cpu(k2->alloc.ar_startblock);
+			be32_to_cpu(k2->alloc.ar_startblock);
 }
 
 STATIC int64_t
 xfs_cntbt_diff_two_keys(
 	struct xfs_btree_cur		*cur,
 	const union xfs_btree_key	*k1,
-	const union xfs_btree_key	*k2)
+	const union xfs_btree_key	*k2,
+	const union xfs_btree_key	*mask)
 {
 	int64_t				diff;
 
+	ASSERT(!mask || (mask->alloc.ar_blockcount &&
+			 mask->alloc.ar_startblock));
+
 	diff =  be32_to_cpu(k1->alloc.ar_blockcount) -
 		be32_to_cpu(k2->alloc.ar_blockcount);
 	if (diff)
@@ -423,6 +430,19 @@ xfs_cntbt_recs_inorder(
 		 be32_to_cpu(r2->alloc.ar_startblock));
 }
 
+STATIC enum xbtree_key_contig
+xfs_allocbt_keys_contiguous(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key1,
+	const union xfs_btree_key	*key2,
+	const union xfs_btree_key	*mask)
+{
+	ASSERT(!mask || mask->alloc.ar_startblock);
+
+	return xbtree_key_contig(be32_to_cpu(key1->alloc.ar_startblock),
+				 be32_to_cpu(key2->alloc.ar_startblock));
+}
+
 static const struct xfs_btree_ops xfs_bnobt_ops = {
 	.rec_len		= sizeof(xfs_alloc_rec_t),
 	.key_len		= sizeof(xfs_alloc_key_t),
@@ -443,6 +463,7 @@ static const struct xfs_btree_ops xfs_bnobt_ops = {
 	.diff_two_keys		= xfs_bnobt_diff_two_keys,
 	.keys_inorder		= xfs_bnobt_keys_inorder,
 	.recs_inorder		= xfs_bnobt_recs_inorder,
+	.keys_contiguous	= xfs_allocbt_keys_contiguous,
 };
 
 static const struct xfs_btree_ops xfs_cntbt_ops = {
@@ -465,6 +486,7 @@ static const struct xfs_btree_ops xfs_cntbt_ops = {
 	.diff_two_keys		= xfs_cntbt_diff_two_keys,
 	.keys_inorder		= xfs_cntbt_keys_inorder,
 	.recs_inorder		= xfs_cntbt_recs_inorder,
+	.keys_contiguous	= NULL, /* not needed right now */
 };
 
 /* Allocate most of a new allocation btree cursor. */
@@ -492,9 +514,7 @@ xfs_allocbt_init_common(
 		cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
 	}
 
-	/* take a reference for the cursor */
-	atomic_inc(&pag->pag_ref);
-	cur->bc_ag.pag = pag;
+	cur->bc_ag.pag = xfs_perag_hold(pag);
 
 	if (xfs_has_crc(mp))
 		cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 34de6e6898c4..b512de0540d5 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1083,6 +1083,34 @@ struct xfs_iread_state {
 	xfs_extnum_t		loaded;
 };
 
+int
+xfs_bmap_complain_bad_rec(
+	struct xfs_inode		*ip,
+	int				whichfork,
+	xfs_failaddr_t			fa,
+	const struct xfs_bmbt_irec	*irec)
+{
+	struct xfs_mount		*mp = ip->i_mount;
+	const char			*forkname;
+
+	switch (whichfork) {
+	case XFS_DATA_FORK:	forkname = "data"; break;
+	case XFS_ATTR_FORK:	forkname = "attr"; break;
+	case XFS_COW_FORK:	forkname = "CoW"; break;
+	default:		forkname = "???"; break;
+	}
+
+	xfs_warn(mp,
+ "Bmap BTree record corruption in inode 0x%llx %s fork detected at %pS!",
+				ip->i_ino, forkname, fa);
+	xfs_warn(mp,
+		"Offset 0x%llx, start block 0x%llx, block count 0x%llx state 0x%x",
+		irec->br_startoff, irec->br_startblock, irec->br_blockcount,
+		irec->br_state);
+
+	return -EFSCORRUPTED;
+}
+
 /* Stuff every bmbt record from this block into the incore extent map. */
 static int
 xfs_iread_bmbt_block(
@@ -1125,7 +1153,8 @@ xfs_iread_bmbt_block(
 			xfs_inode_verifier_error(ip, -EFSCORRUPTED,
 					"xfs_iread_extents(2)", frp,
 					sizeof(*frp), fa);
-			return -EFSCORRUPTED;
+			return xfs_bmap_complain_bad_rec(ip, whichfork, fa,
+					&new);
 		}
 		xfs_iext_insert(ip, &ir->icur, &new,
 				xfs_bmap_fork_to_state(whichfork));
@@ -1171,6 +1200,12 @@ xfs_iread_extents(
 		goto out;
 	}
 	ASSERT(ir.loaded == xfs_iext_count(ifp));
+	/*
+	 * Use release semantics so that we can use acquire semantics in
+	 * xfs_need_iread_extents and be guaranteed to see a valid mapping tree
+	 * after that load.
+	 */
+	smp_store_release(&ifp->if_needextents, 0);
 	return 0;
 out:
 	xfs_iext_destroy(ifp);
@@ -3505,7 +3540,6 @@ xfs_bmap_btalloc_at_eof(
 	 * original non-aligned state so the caller can proceed on allocation
 	 * failure as if this function was never called.
 	 */
-	args->fsbno = ap->blkno;
 	args->alignment = 1;
 	return 0;
 }
@@ -6075,6 +6109,7 @@ __xfs_bmap_add(
 	bi->bi_whichfork = whichfork;
 	bi->bi_bmap = *bmap;
 
+	xfs_bmap_update_get_group(tp->t_mountp, bi);
 	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list);
 	return 0;
 }
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index dd08361ca5a6..e33470e39728 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -145,7 +145,7 @@ static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags)
 	{ BMAP_COWFORK,		"COW" }
 
 /* Return true if the extent is an allocated extent, written or not. */
-static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
+static inline bool xfs_bmap_is_real_extent(const struct xfs_bmbt_irec *irec)
 {
 	return irec->br_startblock != HOLESTARTBLOCK &&
 		irec->br_startblock != DELAYSTARTBLOCK &&
@@ -238,9 +238,13 @@ struct xfs_bmap_intent {
 	enum xfs_bmap_intent_type		bi_type;
 	int					bi_whichfork;
 	struct xfs_inode			*bi_owner;
+	struct xfs_perag			*bi_pag;
 	struct xfs_bmbt_irec			bi_bmap;
 };
 
+void xfs_bmap_update_get_group(struct xfs_mount *mp,
+		struct xfs_bmap_intent *bi);
+
 int	xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_bmap_intent *bi);
 void	xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 		struct xfs_bmbt_irec *imap);
@@ -261,6 +265,8 @@ static inline uint32_t xfs_bmap_fork_to_state(int whichfork)
 
 xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork,
 		struct xfs_bmbt_irec *irec);
+int xfs_bmap_complain_bad_rec(struct xfs_inode *ip, int whichfork,
+		xfs_failaddr_t fa, const struct xfs_bmbt_irec *irec);
 
 int	xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip,
 		xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock,
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index b8ad95050c9b..1b40e5f8b1ec 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -382,11 +382,14 @@ STATIC int64_t
 xfs_bmbt_diff_two_keys(
 	struct xfs_btree_cur		*cur,
 	const union xfs_btree_key	*k1,
-	const union xfs_btree_key	*k2)
+	const union xfs_btree_key	*k2,
+	const union xfs_btree_key	*mask)
 {
 	uint64_t			a = be64_to_cpu(k1->bmbt.br_startoff);
 	uint64_t			b = be64_to_cpu(k2->bmbt.br_startoff);
 
+	ASSERT(!mask || mask->bmbt.br_startoff);
+
 	/*
 	 * Note: This routine previously casted a and b to int64 and subtracted
 	 * them to generate a result.  This lead to problems if b was the
@@ -500,6 +503,19 @@ xfs_bmbt_recs_inorder(
 		xfs_bmbt_disk_get_startoff(&r2->bmbt);
 }
 
+STATIC enum xbtree_key_contig
+xfs_bmbt_keys_contiguous(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key1,
+	const union xfs_btree_key	*key2,
+	const union xfs_btree_key	*mask)
+{
+	ASSERT(!mask || mask->bmbt.br_startoff);
+
+	return xbtree_key_contig(be64_to_cpu(key1->bmbt.br_startoff),
+				 be64_to_cpu(key2->bmbt.br_startoff));
+}
+
 static const struct xfs_btree_ops xfs_bmbt_ops = {
 	.rec_len		= sizeof(xfs_bmbt_rec_t),
 	.key_len		= sizeof(xfs_bmbt_key_t),
@@ -520,6 +536,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
 	.buf_ops		= &xfs_bmbt_buf_ops,
 	.keys_inorder		= xfs_bmbt_keys_inorder,
 	.recs_inorder		= xfs_bmbt_recs_inorder,
+	.keys_contiguous	= xfs_bmbt_keys_contiguous,
 };
 
 /*
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index c4649cc624e1..6a6503ab0cd7 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -2067,8 +2067,7 @@ xfs_btree_get_leaf_keys(
 		for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
 			rec = xfs_btree_rec_addr(cur, n, block);
 			cur->bc_ops->init_high_key_from_rec(&hkey, rec);
-			if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey)
-					> 0)
+			if (xfs_btree_keycmp_gt(cur, &hkey, &max_hkey))
 				max_hkey = hkey;
 		}
 
@@ -2096,7 +2095,7 @@ xfs_btree_get_node_keys(
 		max_hkey = xfs_btree_high_key_addr(cur, 1, block);
 		for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
 			hkey = xfs_btree_high_key_addr(cur, n, block);
-			if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0)
+			if (xfs_btree_keycmp_gt(cur, hkey, max_hkey))
 				max_hkey = hkey;
 		}
 
@@ -2183,8 +2182,8 @@ __xfs_btree_updkeys(
 		nlkey = xfs_btree_key_addr(cur, ptr, block);
 		nhkey = xfs_btree_high_key_addr(cur, ptr, block);
 		if (!force_all &&
-		    !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 ||
-		      cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0))
+		    xfs_btree_keycmp_eq(cur, nlkey, lkey) &&
+		    xfs_btree_keycmp_eq(cur, nhkey, hkey))
 			break;
 		xfs_btree_copy_keys(cur, nlkey, lkey, 1);
 		xfs_btree_log_keys(cur, bp, ptr, ptr);
@@ -4716,7 +4715,6 @@ xfs_btree_simple_query_range(
 {
 	union xfs_btree_rec		*recp;
 	union xfs_btree_key		rec_key;
-	int64_t				diff;
 	int				stat;
 	bool				firstrec = true;
 	int				error;
@@ -4746,20 +4744,17 @@ xfs_btree_simple_query_range(
 		if (error || !stat)
 			break;
 
-		/* Skip if high_key(rec) < low_key. */
+		/* Skip if low_key > high_key(rec). */
 		if (firstrec) {
 			cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
 			firstrec = false;
-			diff = cur->bc_ops->diff_two_keys(cur, low_key,
-					&rec_key);
-			if (diff > 0)
+			if (xfs_btree_keycmp_gt(cur, low_key, &rec_key))
 				goto advloop;
 		}
 
-		/* Stop if high_key < low_key(rec). */
+		/* Stop if low_key(rec) > high_key. */
 		cur->bc_ops->init_key_from_rec(&rec_key, recp);
-		diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
-		if (diff > 0)
+		if (xfs_btree_keycmp_gt(cur, &rec_key, high_key))
 			break;
 
 		/* Callback */
@@ -4813,8 +4808,6 @@ xfs_btree_overlapped_query_range(
 	union xfs_btree_key		*hkp;
 	union xfs_btree_rec		*recp;
 	struct xfs_btree_block		*block;
-	int64_t				ldiff;
-	int64_t				hdiff;
 	int				level;
 	struct xfs_buf			*bp;
 	int				i;
@@ -4854,25 +4847,23 @@ pop_up:
 					block);
 
 			cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
-			ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
-					low_key);
-
 			cur->bc_ops->init_key_from_rec(&rec_key, recp);
-			hdiff = cur->bc_ops->diff_two_keys(cur, high_key,
-					&rec_key);
 
 			/*
+			 * If (query's high key < record's low key), then there
+			 * are no more interesting records in this block.  Pop
+			 * up to the leaf level to find more record blocks.
+			 *
 			 * If (record's high key >= query's low key) and
 			 *    (query's high key >= record's low key), then
 			 * this record overlaps the query range; callback.
 			 */
-			if (ldiff >= 0 && hdiff >= 0) {
+			if (xfs_btree_keycmp_lt(cur, high_key, &rec_key))
+				goto pop_up;
+			if (xfs_btree_keycmp_ge(cur, &rec_hkey, low_key)) {
 				error = fn(cur, recp, priv);
 				if (error)
 					break;
-			} else if (hdiff < 0) {
-				/* Record is larger than high key; pop. */
-				goto pop_up;
 			}
 			cur->bc_levels[level].ptr++;
 			continue;
@@ -4884,15 +4875,18 @@ pop_up:
 				block);
 		pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
 
-		ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
-		hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);
-
 		/*
+		 * If (query's high key < pointer's low key), then there are no
+		 * more interesting keys in this block.  Pop up one leaf level
+		 * to continue looking for records.
+		 *
 		 * If (pointer's high key >= query's low key) and
 		 *    (query's high key >= pointer's low key), then
 		 * this record overlaps the query range; follow pointer.
 		 */
-		if (ldiff >= 0 && hdiff >= 0) {
+		if (xfs_btree_keycmp_lt(cur, high_key, lkp))
+			goto pop_up;
+		if (xfs_btree_keycmp_ge(cur, hkp, low_key)) {
 			level--;
 			error = xfs_btree_lookup_get_block(cur, level, pp,
 					&block);
@@ -4907,9 +4901,6 @@ pop_up:
 #endif
 			cur->bc_levels[level].ptr = 1;
 			continue;
-		} else if (hdiff < 0) {
-			/* The low key is larger than the upper range; pop. */
-			goto pop_up;
 		}
 		cur->bc_levels[level].ptr++;
 	}
@@ -4937,6 +4928,19 @@ out:
 	return error;
 }
 
+static inline void
+xfs_btree_key_from_irec(
+	struct xfs_btree_cur		*cur,
+	union xfs_btree_key		*key,
+	const union xfs_btree_irec	*irec)
+{
+	union xfs_btree_rec		rec;
+
+	cur->bc_rec = *irec;
+	cur->bc_ops->init_rec_from_cur(cur, &rec);
+	cur->bc_ops->init_key_from_rec(key, &rec);
+}
+
 /*
  * Query a btree for all records overlapping a given interval of keys.  The
  * supplied function will be called with each record found; return one of the
@@ -4951,21 +4955,15 @@ xfs_btree_query_range(
 	xfs_btree_query_range_fn	fn,
 	void				*priv)
 {
-	union xfs_btree_rec		rec;
 	union xfs_btree_key		low_key;
 	union xfs_btree_key		high_key;
 
 	/* Find the keys of both ends of the interval. */
-	cur->bc_rec = *high_rec;
-	cur->bc_ops->init_rec_from_cur(cur, &rec);
-	cur->bc_ops->init_key_from_rec(&high_key, &rec);
+	xfs_btree_key_from_irec(cur, &high_key, high_rec);
+	xfs_btree_key_from_irec(cur, &low_key, low_rec);
 
-	cur->bc_rec = *low_rec;
-	cur->bc_ops->init_rec_from_cur(cur, &rec);
-	cur->bc_ops->init_key_from_rec(&low_key, &rec);
-
-	/* Enforce low key < high key. */
-	if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0)
+	/* Enforce low key <= high key. */
+	if (!xfs_btree_keycmp_le(cur, &low_key, &high_key))
 		return -EINVAL;
 
 	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
@@ -5027,34 +5025,132 @@ xfs_btree_diff_two_ptrs(
 	return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
 }
 
-/* If there's an extent, we're done. */
+struct xfs_btree_has_records {
+	/* Keys for the start and end of the range we want to know about. */
+	union xfs_btree_key		start_key;
+	union xfs_btree_key		end_key;
+
+	/* Mask for key comparisons, if desired. */
+	const union xfs_btree_key	*key_mask;
+
+	/* Highest record key we've seen so far. */
+	union xfs_btree_key		high_key;
+
+	enum xbtree_recpacking		outcome;
+};
+
 STATIC int
-xfs_btree_has_record_helper(
+xfs_btree_has_records_helper(
 	struct xfs_btree_cur		*cur,
 	const union xfs_btree_rec	*rec,
 	void				*priv)
 {
-	return -ECANCELED;
+	union xfs_btree_key		rec_key;
+	union xfs_btree_key		rec_high_key;
+	struct xfs_btree_has_records	*info = priv;
+	enum xbtree_key_contig		key_contig;
+
+	cur->bc_ops->init_key_from_rec(&rec_key, rec);
+
+	if (info->outcome == XBTREE_RECPACKING_EMPTY) {
+		info->outcome = XBTREE_RECPACKING_SPARSE;
+
+		/*
+		 * If the first record we find does not overlap the start key,
+		 * then there is a hole at the start of the search range.
+		 * Classify this as sparse and stop immediately.
+		 */
+		if (xfs_btree_masked_keycmp_lt(cur, &info->start_key, &rec_key,
+					info->key_mask))
+			return -ECANCELED;
+	} else {
+		/*
+		 * If a subsequent record does not overlap with the any record
+		 * we've seen so far, there is a hole in the middle of the
+		 * search range.  Classify this as sparse and stop.
+		 * If the keys overlap and this btree does not allow overlap,
+		 * signal corruption.
+		 */
+		key_contig = cur->bc_ops->keys_contiguous(cur, &info->high_key,
+					&rec_key, info->key_mask);
+		if (key_contig == XBTREE_KEY_OVERLAP &&
+				!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+			return -EFSCORRUPTED;
+		if (key_contig == XBTREE_KEY_GAP)
+			return -ECANCELED;
+	}
+
+	/*
+	 * If high_key(rec) is larger than any other high key we've seen,
+	 * remember it for later.
+	 */
+	cur->bc_ops->init_high_key_from_rec(&rec_high_key, rec);
+	if (xfs_btree_masked_keycmp_gt(cur, &rec_high_key, &info->high_key,
+				info->key_mask))
+		info->high_key = rec_high_key; /* struct copy */
+
+	return 0;
 }
 
-/* Is there a record covering a given range of keys? */
+/*
+ * Scan part of the keyspace of a btree and tell us if that keyspace does not
+ * map to any records; is fully mapped to records; or is partially mapped to
+ * records.  This is the btree record equivalent to determining if a file is
+ * sparse.
+ *
+ * For most btree types, the record scan should use all available btree key
+ * fields to compare the keys encountered.  These callers should pass NULL for
+ * @mask.  However, some callers (e.g.  scanning physical space in the rmapbt)
+ * want to ignore some part of the btree record keyspace when performing the
+ * comparison.  These callers should pass in a union xfs_btree_key object with
+ * the fields that *should* be a part of the comparison set to any nonzero
+ * value, and the rest zeroed.
+ */
 int
-xfs_btree_has_record(
+xfs_btree_has_records(
 	struct xfs_btree_cur		*cur,
 	const union xfs_btree_irec	*low,
 	const union xfs_btree_irec	*high,
-	bool				*exists)
+	const union xfs_btree_key	*mask,
+	enum xbtree_recpacking		*outcome)
 {
+	struct xfs_btree_has_records	info = {
+		.outcome		= XBTREE_RECPACKING_EMPTY,
+		.key_mask		= mask,
+	};
 	int				error;
 
-	error = xfs_btree_query_range(cur, low, high,
-			&xfs_btree_has_record_helper, NULL);
-	if (error == -ECANCELED) {
-		*exists = true;
-		return 0;
+	/* Not all btrees support this operation. */
+	if (!cur->bc_ops->keys_contiguous) {
+		ASSERT(0);
+		return -EOPNOTSUPP;
 	}
-	*exists = false;
-	return error;
+
+	xfs_btree_key_from_irec(cur, &info.start_key, low);
+	xfs_btree_key_from_irec(cur, &info.end_key, high);
+
+	error = xfs_btree_query_range(cur, low, high,
+			xfs_btree_has_records_helper, &info);
+	if (error == -ECANCELED)
+		goto out;
+	if (error)
+		return error;
+
+	if (info.outcome == XBTREE_RECPACKING_EMPTY)
+		goto out;
+
+	/*
+	 * If the largest high_key(rec) we saw during the walk is greater than
+	 * the end of the search range, classify this as full.  Otherwise,
+	 * there is a hole at the end of the search range.
+	 */
+	if (xfs_btree_masked_keycmp_ge(cur, &info.high_key, &info.end_key,
+				mask))
+		info.outcome = XBTREE_RECPACKING_FULL;
+
+out:
+	*outcome = info.outcome;
+	return 0;
 }
 
 /* Are there more records in this btree? */
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 29c4b4ccb909..a2aa36b23e25 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -90,6 +90,27 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
 #define XFS_BTREE_STATS_ADD(cur, stat, val)	\
 	XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val)
 
+enum xbtree_key_contig {
+	XBTREE_KEY_GAP = 0,
+	XBTREE_KEY_CONTIGUOUS,
+	XBTREE_KEY_OVERLAP,
+};
+
+/*
+ * Decide if these two numeric btree key fields are contiguous, overlapping,
+ * or if there's a gap between them.  @x should be the field from the high
+ * key and @y should be the field from the low key.
+ */
+static inline enum xbtree_key_contig xbtree_key_contig(uint64_t x, uint64_t y)
+{
+	x++;
+	if (x < y)
+		return XBTREE_KEY_GAP;
+	if (x == y)
+		return XBTREE_KEY_CONTIGUOUS;
+	return XBTREE_KEY_OVERLAP;
+}
+
 struct xfs_btree_ops {
 	/* size of the key and record structures */
 	size_t	key_len;
@@ -140,11 +161,14 @@ struct xfs_btree_ops {
 
 	/*
 	 * Difference between key2 and key1 -- positive if key1 > key2,
-	 * negative if key1 < key2, and zero if equal.
+	 * negative if key1 < key2, and zero if equal.  If the @mask parameter
+	 * is non NULL, each key field to be used in the comparison must
+	 * contain a nonzero value.
 	 */
 	int64_t (*diff_two_keys)(struct xfs_btree_cur *cur,
 				 const union xfs_btree_key *key1,
-				 const union xfs_btree_key *key2);
+				 const union xfs_btree_key *key2,
+				 const union xfs_btree_key *mask);
 
 	const struct xfs_buf_ops	*buf_ops;
 
@@ -157,6 +181,22 @@ struct xfs_btree_ops {
 	int	(*recs_inorder)(struct xfs_btree_cur *cur,
 				const union xfs_btree_rec *r1,
 				const union xfs_btree_rec *r2);
+
+	/*
+	 * Are these two btree keys immediately adjacent?
+	 *
+	 * Given two btree keys @key1 and @key2, decide if it is impossible for
+	 * there to be a third btree key K satisfying the relationship
+	 * @key1 < K < @key2.  To determine if two btree records are
+	 * immediately adjacent, @key1 should be the high key of the first
+	 * record and @key2 should be the low key of the second record.
+	 * If the @mask parameter is non NULL, each key field to be used in the
+	 * comparison must contain a nonzero value.
+	 */
+	enum xbtree_key_contig (*keys_contiguous)(struct xfs_btree_cur *cur,
+			       const union xfs_btree_key *key1,
+			       const union xfs_btree_key *key2,
+			       const union xfs_btree_key *mask);
 };
 
 /*
@@ -540,12 +580,105 @@ void xfs_btree_get_keys(struct xfs_btree_cur *cur,
 		struct xfs_btree_block *block, union xfs_btree_key *key);
 union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur,
 		union xfs_btree_key *key);
-int xfs_btree_has_record(struct xfs_btree_cur *cur,
+typedef bool (*xfs_btree_key_gap_fn)(struct xfs_btree_cur *cur,
+		const union xfs_btree_key *key1,
+		const union xfs_btree_key *key2);
+
+int xfs_btree_has_records(struct xfs_btree_cur *cur,
 		const union xfs_btree_irec *low,
-		const union xfs_btree_irec *high, bool *exists);
+		const union xfs_btree_irec *high,
+		const union xfs_btree_key *mask,
+		enum xbtree_recpacking *outcome);
+
 bool xfs_btree_has_more_records(struct xfs_btree_cur *cur);
 struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur);
 
+/* Key comparison helpers */
+static inline bool
+xfs_btree_keycmp_lt(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key1,
+	const union xfs_btree_key	*key2)
+{
+	return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) < 0;
+}
+
+static inline bool
+xfs_btree_keycmp_gt(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key1,
+	const union xfs_btree_key	*key2)
+{
+	return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) > 0;
+}
+
+static inline bool
+xfs_btree_keycmp_eq(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key1,
+	const union xfs_btree_key	*key2)
+{
+	return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) == 0;
+}
+
+static inline bool
+xfs_btree_keycmp_le(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key1,
+	const union xfs_btree_key	*key2)
+{
+	return !xfs_btree_keycmp_gt(cur, key1, key2);
+}
+
+static inline bool
+xfs_btree_keycmp_ge(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key1,
+	const union xfs_btree_key	*key2)
+{
+	return !xfs_btree_keycmp_lt(cur, key1, key2);
+}
+
+static inline bool
+xfs_btree_keycmp_ne(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key1,
+	const union xfs_btree_key	*key2)
+{
+	return !xfs_btree_keycmp_eq(cur, key1, key2);
+}
+
+/* Masked key comparison helpers */
+static inline bool
+xfs_btree_masked_keycmp_lt(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key1,
+	const union xfs_btree_key	*key2,
+	const union xfs_btree_key	*mask)
+{
+	return cur->bc_ops->diff_two_keys(cur, key1, key2, mask) < 0;
+}
+
+static inline bool
+xfs_btree_masked_keycmp_gt(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key1,
+	const union xfs_btree_key	*key2,
+	const union xfs_btree_key	*mask)
+{
+	return cur->bc_ops->diff_two_keys(cur, key1, key2, mask) > 0;
+}
+
+static inline bool
+xfs_btree_masked_keycmp_ge(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key1,
+	const union xfs_btree_key	*key2,
+	const union xfs_btree_key	*mask)
+{
+	return !xfs_btree_masked_keycmp_lt(cur, key1, key2, mask);
+}
+
 /* Does this cursor point to the last block in the given level? */
 static inline bool
 xfs_btree_islastblock(
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 5a321b783398..bcfb6a4203cd 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -397,6 +397,7 @@ xfs_defer_cancel_list(
 		list_for_each_safe(pwi, n, &dfp->dfp_work) {
 			list_del(pwi);
 			dfp->dfp_count--;
+			trace_xfs_defer_cancel_item(mp, dfp, pwi);
 			ops->cancel_item(pwi);
 		}
 		ASSERT(dfp->dfp_count == 0);
@@ -476,6 +477,7 @@ xfs_defer_finish_one(
 	list_for_each_safe(li, n, &dfp->dfp_work) {
 		list_del(li);
 		dfp->dfp_count--;
+		trace_xfs_defer_finish_item(tp->t_mountp, dfp, li);
 		error = ops->finish_item(tp, dfp->dfp_done, li, &state);
 		if (error == -EAGAIN) {
 			int		ret;
@@ -623,7 +625,7 @@ xfs_defer_add(
 	struct list_head		*li)
 {
 	struct xfs_defer_pending	*dfp = NULL;
-	const struct xfs_defer_op_type	*ops;
+	const struct xfs_defer_op_type	*ops = defer_op_types[type];
 
 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
 	BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX);
@@ -636,7 +638,6 @@ xfs_defer_add(
 	if (!list_empty(&tp->t_dfops)) {
 		dfp = list_last_entry(&tp->t_dfops,
 				struct xfs_defer_pending, dfp_list);
-		ops = defer_op_types[dfp->dfp_type];
 		if (dfp->dfp_type != type ||
 		    (ops->max_items && dfp->dfp_count >= ops->max_items))
 			dfp = NULL;
@@ -653,6 +654,7 @@ xfs_defer_add(
 	}
 
 	list_add_tail(li, &dfp->dfp_work);
+	trace_xfs_defer_add_item(tp->t_mountp, dfp, li);
 	dfp->dfp_count++;
 }
 
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 92bac3373f1f..f5462fd582d5 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -64,7 +64,7 @@ xfs_ascii_ci_hashname(
 	int			i;
 
 	for (i = 0, hash = 0; i < name->len; i++)
-		hash = tolower(name->name[i]) ^ rol32(hash, 7);
+		hash = xfs_ascii_ci_xfrm(name->name[i]) ^ rol32(hash, 7);
 
 	return hash;
 }
@@ -85,7 +85,8 @@ xfs_ascii_ci_compname(
 	for (i = 0; i < len; i++) {
 		if (args->name[i] == name[i])
 			continue;
-		if (tolower(args->name[i]) != tolower(name[i]))
+		if (xfs_ascii_ci_xfrm(args->name[i]) !=
+		    xfs_ascii_ci_xfrm(name[i]))
 			return XFS_CMP_DIFFERENT;
 		result = XFS_CMP_CASE;
 	}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index dd39f17dd9a9..19af22a16c41 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -248,4 +248,35 @@ unsigned int xfs_dir3_data_end_offset(struct xfs_da_geometry *geo,
 		struct xfs_dir2_data_hdr *hdr);
 bool xfs_dir2_namecheck(const void *name, size_t length);
 
+/*
+ * The "ascii-ci" feature was created to speed up case-insensitive lookups for
+ * a Samba product.  Because of the inherent problems with CI and UTF-8
+ * encoding, etc, it was decided that Samba would be configured to export
+ * latin1/iso 8859-1 encodings as that covered >90% of the target markets for
+ * the product.  Hence the "ascii-ci" casefolding code could be encoded into
+ * the XFS directory operations and remove all the overhead of casefolding from
+ * Samba.
+ *
+ * To provide consistent hashing behavior between the userspace and kernel,
+ * these functions prepare names for hashing by transforming specific bytes
+ * to other bytes.  Robustness with other encodings is not guaranteed.
+ */
+static inline bool xfs_ascii_ci_need_xfrm(unsigned char c)
+{
+	if (c >= 0x41 && c <= 0x5a)	/* A-Z */
+		return true;
+	if (c >= 0xc0 && c <= 0xd6)	/* latin A-O with accents */
+		return true;
+	if (c >= 0xd8 && c <= 0xde)	/* latin O-Y with accents */
+		return true;
+	return false;
+}
+
+static inline unsigned char xfs_ascii_ci_xfrm(unsigned char c)
+{
+	if (xfs_ascii_ci_need_xfrm(c))
+		c -= 'A' - 'a';
+	return c;
+}
+
 #endif	/* __XFS_DIR2_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 7ee292aecbeb..a16d5de16933 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -95,33 +95,25 @@ xfs_inobt_btrec_to_irec(
 	irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
 }
 
-/*
- * Get the data from the pointed-to record.
- */
-int
-xfs_inobt_get_rec(
-	struct xfs_btree_cur		*cur,
-	struct xfs_inobt_rec_incore	*irec,
-	int				*stat)
+/* Simple checks for inode records. */
+xfs_failaddr_t
+xfs_inobt_check_irec(
+	struct xfs_btree_cur			*cur,
+	const struct xfs_inobt_rec_incore	*irec)
 {
-	struct xfs_mount		*mp = cur->bc_mp;
-	union xfs_btree_rec		*rec;
-	int				error;
 	uint64_t			realfree;
 
-	error = xfs_btree_get_rec(cur, &rec, stat);
-	if (error || *stat == 0)
-		return error;
-
-	xfs_inobt_btrec_to_irec(mp, rec, irec);
-
+	/* Record has to be properly aligned within the AG. */
 	if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino))
-		goto out_bad_rec;
+		return __this_address;
+	if (!xfs_verify_agino(cur->bc_ag.pag,
+				irec->ir_startino + XFS_INODES_PER_CHUNK - 1))
+		return __this_address;
 	if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT ||
 	    irec->ir_count > XFS_INODES_PER_CHUNK)
-		goto out_bad_rec;
+		return __this_address;
 	if (irec->ir_freecount > XFS_INODES_PER_CHUNK)
-		goto out_bad_rec;
+		return __this_address;
 
 	/* if there are no holes, return the first available offset */
 	if (!xfs_inobt_issparse(irec->ir_holemask))
@@ -129,15 +121,23 @@ xfs_inobt_get_rec(
 	else
 		realfree = irec->ir_free & xfs_inobt_irec_to_allocmask(irec);
 	if (hweight64(realfree) != irec->ir_freecount)
-		goto out_bad_rec;
+		return __this_address;
 
-	return 0;
+	return NULL;
+}
+
+static inline int
+xfs_inobt_complain_bad_rec(
+	struct xfs_btree_cur		*cur,
+	xfs_failaddr_t			fa,
+	const struct xfs_inobt_rec_incore *irec)
+{
+	struct xfs_mount		*mp = cur->bc_mp;
 
-out_bad_rec:
 	xfs_warn(mp,
-		"%s Inode BTree record corruption in AG %d detected!",
+		"%s Inode BTree record corruption in AG %d detected at %pS!",
 		cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free",
-		cur->bc_ag.pag->pag_agno);
+		cur->bc_ag.pag->pag_agno, fa);
 	xfs_warn(mp,
 "start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x",
 		irec->ir_startino, irec->ir_count, irec->ir_freecount,
@@ -146,6 +146,32 @@ out_bad_rec:
 }
 
 /*
+ * Get the data from the pointed-to record.
+ */
+int
+xfs_inobt_get_rec(
+	struct xfs_btree_cur		*cur,
+	struct xfs_inobt_rec_incore	*irec,
+	int				*stat)
+{
+	struct xfs_mount		*mp = cur->bc_mp;
+	union xfs_btree_rec		*rec;
+	xfs_failaddr_t			fa;
+	int				error;
+
+	error = xfs_btree_get_rec(cur, &rec, stat);
+	if (error || *stat == 0)
+		return error;
+
+	xfs_inobt_btrec_to_irec(mp, rec, irec);
+	fa = xfs_inobt_check_irec(cur, irec);
+	if (fa)
+		return xfs_inobt_complain_bad_rec(cur, fa, irec);
+
+	return 0;
+}
+
+/*
  * Insert a single inobt record. Cursor must already point to desired location.
  */
 int
@@ -1952,8 +1978,6 @@ xfs_difree_inobt(
 	 */
 	if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
 	    mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
-		struct xfs_perag	*pag = agbp->b_pag;
-
 		xic->deleted = true;
 		xic->first_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
 				rec.ir_startino);
@@ -2617,44 +2641,50 @@ xfs_ialloc_read_agi(
 	return 0;
 }
 
-/* Is there an inode record covering a given range of inode numbers? */
-int
-xfs_ialloc_has_inode_record(
-	struct xfs_btree_cur	*cur,
-	xfs_agino_t		low,
-	xfs_agino_t		high,
-	bool			*exists)
+/* How many inodes are backed by inode clusters ondisk? */
+STATIC int
+xfs_ialloc_count_ondisk(
+	struct xfs_btree_cur		*cur,
+	xfs_agino_t			low,
+	xfs_agino_t			high,
+	unsigned int			*allocated)
 {
 	struct xfs_inobt_rec_incore	irec;
-	xfs_agino_t		agino;
-	uint16_t		holemask;
-	int			has_record;
-	int			i;
-	int			error;
+	unsigned int			ret = 0;
+	int				has_record;
+	int				error;
 
-	*exists = false;
 	error = xfs_inobt_lookup(cur, low, XFS_LOOKUP_LE, &has_record);
-	while (error == 0 && has_record) {
+	if (error)
+		return error;
+
+	while (has_record) {
+		unsigned int		i, hole_idx;
+
 		error = xfs_inobt_get_rec(cur, &irec, &has_record);
-		if (error || irec.ir_startino > high)
+		if (error)
+			return error;
+		if (irec.ir_startino > high)
 			break;
 
-		agino = irec.ir_startino;
-		holemask = irec.ir_holemask;
-		for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; holemask >>= 1,
-				i++, agino += XFS_INODES_PER_HOLEMASK_BIT) {
-			if (holemask & 1)
+		for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
+			if (irec.ir_startino + i < low)
 				continue;
-			if (agino + XFS_INODES_PER_HOLEMASK_BIT > low &&
-					agino <= high) {
-				*exists = true;
-				return 0;
-			}
+			if (irec.ir_startino + i > high)
+				break;
+
+			hole_idx = i / XFS_INODES_PER_HOLEMASK_BIT;
+			if (!(irec.ir_holemask & (1U << hole_idx)))
+				ret++;
 		}
 
 		error = xfs_btree_increment(cur, 0, &has_record);
+		if (error)
+			return error;
 	}
-	return error;
+
+	*allocated = ret;
+	return 0;
 }
 
 /* Is there an inode record covering a given extent? */
@@ -2663,15 +2693,27 @@ xfs_ialloc_has_inodes_at_extent(
 	struct xfs_btree_cur	*cur,
 	xfs_agblock_t		bno,
 	xfs_extlen_t		len,
-	bool			*exists)
+	enum xbtree_recpacking	*outcome)
 {
-	xfs_agino_t		low;
-	xfs_agino_t		high;
+	xfs_agino_t		agino;
+	xfs_agino_t		last_agino;
+	unsigned int		allocated;
+	int			error;
+
+	agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno);
+	last_agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1;
 
-	low = XFS_AGB_TO_AGINO(cur->bc_mp, bno);
-	high = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1;
+	error = xfs_ialloc_count_ondisk(cur, agino, last_agino, &allocated);
+	if (error)
+		return error;
 
-	return xfs_ialloc_has_inode_record(cur, low, high, exists);
+	if (allocated == 0)
+		*outcome = XBTREE_RECPACKING_EMPTY;
+	else if (allocated == last_agino - agino + 1)
+		*outcome = XBTREE_RECPACKING_FULL;
+	else
+		*outcome = XBTREE_RECPACKING_SPARSE;
+	return 0;
 }
 
 struct xfs_ialloc_count_inodes {
@@ -2688,8 +2730,13 @@ xfs_ialloc_count_inodes_rec(
 {
 	struct xfs_inobt_rec_incore	irec;
 	struct xfs_ialloc_count_inodes	*ci = priv;
+	xfs_failaddr_t			fa;
 
 	xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec);
+	fa = xfs_inobt_check_irec(cur, &irec);
+	if (fa)
+		return xfs_inobt_complain_bad_rec(cur, fa, &irec);
+
 	ci->count += irec.ir_count;
 	ci->freecount += irec.ir_freecount;
 
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index ab8c30b4ec22..fe824bb04a09 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -93,10 +93,11 @@ union xfs_btree_rec;
 void xfs_inobt_btrec_to_irec(struct xfs_mount *mp,
 		const union xfs_btree_rec *rec,
 		struct xfs_inobt_rec_incore *irec);
+xfs_failaddr_t xfs_inobt_check_irec(struct xfs_btree_cur *cur,
+		const struct xfs_inobt_rec_incore *irec);
 int xfs_ialloc_has_inodes_at_extent(struct xfs_btree_cur *cur,
-		xfs_agblock_t bno, xfs_extlen_t len, bool *exists);
-int xfs_ialloc_has_inode_record(struct xfs_btree_cur *cur, xfs_agino_t low,
-		xfs_agino_t high, bool *exists);
+		xfs_agblock_t bno, xfs_extlen_t len,
+		enum xbtree_recpacking *outcome);
 int xfs_ialloc_count_inodes(struct xfs_btree_cur *cur, xfs_agino_t *count,
 		xfs_agino_t *freecount);
 int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask,
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 9b28211d5a4c..5a945ae21b5d 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -156,9 +156,12 @@ __xfs_inobt_free_block(
 	struct xfs_buf		*bp,
 	enum xfs_ag_resv_type	resv)
 {
+	xfs_fsblock_t		fsbno;
+
 	xfs_inobt_mod_blockcount(cur, -1);
-	return xfs_free_extent(cur->bc_tp,
-			XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)), 1,
+	fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
+	return xfs_free_extent(cur->bc_tp, cur->bc_ag.pag,
+			XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1,
 			&XFS_RMAP_OINFO_INOBT, resv);
 }
 
@@ -266,10 +269,13 @@ STATIC int64_t
 xfs_inobt_diff_two_keys(
 	struct xfs_btree_cur		*cur,
 	const union xfs_btree_key	*k1,
-	const union xfs_btree_key	*k2)
+	const union xfs_btree_key	*k2,
+	const union xfs_btree_key	*mask)
 {
+	ASSERT(!mask || mask->inobt.ir_startino);
+
 	return (int64_t)be32_to_cpu(k1->inobt.ir_startino) -
-			  be32_to_cpu(k2->inobt.ir_startino);
+			be32_to_cpu(k2->inobt.ir_startino);
 }
 
 static xfs_failaddr_t
@@ -380,6 +386,19 @@ xfs_inobt_recs_inorder(
 		be32_to_cpu(r2->inobt.ir_startino);
 }
 
+STATIC enum xbtree_key_contig
+xfs_inobt_keys_contiguous(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key1,
+	const union xfs_btree_key	*key2,
+	const union xfs_btree_key	*mask)
+{
+	ASSERT(!mask || mask->inobt.ir_startino);
+
+	return xbtree_key_contig(be32_to_cpu(key1->inobt.ir_startino),
+				 be32_to_cpu(key2->inobt.ir_startino));
+}
+
 static const struct xfs_btree_ops xfs_inobt_ops = {
 	.rec_len		= sizeof(xfs_inobt_rec_t),
 	.key_len		= sizeof(xfs_inobt_key_t),
@@ -399,6 +418,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
 	.diff_two_keys		= xfs_inobt_diff_two_keys,
 	.keys_inorder		= xfs_inobt_keys_inorder,
 	.recs_inorder		= xfs_inobt_recs_inorder,
+	.keys_contiguous	= xfs_inobt_keys_contiguous,
 };
 
 static const struct xfs_btree_ops xfs_finobt_ops = {
@@ -420,6 +440,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
 	.diff_two_keys		= xfs_inobt_diff_two_keys,
 	.keys_inorder		= xfs_inobt_keys_inorder,
 	.recs_inorder		= xfs_inobt_recs_inorder,
+	.keys_contiguous	= xfs_inobt_keys_contiguous,
 };
 
 /*
@@ -447,9 +468,7 @@ xfs_inobt_init_common(
 	if (xfs_has_crc(mp))
 		cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
 
-	/* take a reference for the cursor */
-	atomic_inc(&pag->pag_ref);
-	cur->bc_ag.pag = pag;
+	cur->bc_ag.pag = xfs_perag_hold(pag);
 	return cur;
 }
 
@@ -607,7 +626,7 @@ xfs_iallocbt_maxlevels_ondisk(void)
  */
 uint64_t
 xfs_inobt_irec_to_allocmask(
-	struct xfs_inobt_rec_incore	*rec)
+	const struct xfs_inobt_rec_incore	*rec)
 {
 	uint64_t			bitmap = 0;
 	uint64_t			inodespbit;
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index e859a6e05230..3262c3fe5ebe 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -53,7 +53,7 @@ struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_perag *pag,
 extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
 
 /* ir_holemask to inode allocation bitmap conversion */
-uint64_t xfs_inobt_irec_to_allocmask(struct xfs_inobt_rec_incore *);
+uint64_t xfs_inobt_irec_to_allocmask(const struct xfs_inobt_rec_incore *irec);
 
 #if defined(DEBUG) || defined(XFS_WARN)
 int xfs_inobt_rec_check_count(struct xfs_mount *,
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 6b21760184d9..5a2e7ddfa76d 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -140,7 +140,8 @@ xfs_iformat_extents(
 				xfs_inode_verifier_error(ip, -EFSCORRUPTED,
 						"xfs_iformat_extents(2)",
 						dp, sizeof(*dp), fa);
-				return -EFSCORRUPTED;
+				return xfs_bmap_complain_bad_rec(ip, whichfork,
+						fa, &new);
 			}
 
 			xfs_iext_insert(ip, &icur, &new, state);
@@ -226,10 +227,15 @@ xfs_iformat_data_fork(
 
 	/*
 	 * Initialize the extent count early, as the per-format routines may
-	 * depend on it.
+	 * depend on it.  Use release semantics to set needextents /after/ we
+	 * set the format. This ensures that we can use acquire semantics on
+	 * needextents in xfs_need_iread_extents() and be guaranteed to see a
+	 * valid format value after that load.
 	 */
 	ip->i_df.if_format = dip->di_format;
 	ip->i_df.if_nextents = xfs_dfork_data_extents(dip);
+	smp_store_release(&ip->i_df.if_needextents,
+			   ip->i_df.if_format == XFS_DINODE_FMT_BTREE ? 1 : 0);
 
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFIFO:
@@ -282,8 +288,17 @@ xfs_ifork_init_attr(
 	enum xfs_dinode_fmt	format,
 	xfs_extnum_t		nextents)
 {
+	/*
+	 * Initialize the extent count early, as the per-format routines may
+	 * depend on it.  Use release semantics to set needextents /after/ we
+	 * set the format. This ensures that we can use acquire semantics on
+	 * needextents in xfs_need_iread_extents() and be guaranteed to see a
+	 * valid format value after that load.
+	 */
 	ip->i_af.if_format = format;
 	ip->i_af.if_nextents = nextents;
+	smp_store_release(&ip->i_af.if_needextents,
+			   ip->i_af.if_format == XFS_DINODE_FMT_BTREE ? 1 : 0);
 }
 
 void
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index d3943d6ad0b9..96d307784c85 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -24,6 +24,7 @@ struct xfs_ifork {
 	xfs_extnum_t		if_nextents;	/* # of extents in this fork */
 	short			if_broot_bytes;	/* bytes allocated for root */
 	int8_t			if_format;	/* format of this fork */
+	uint8_t			if_needextents;	/* extents have not been read */
 };
 
 /*
@@ -260,9 +261,10 @@ int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip,
 		uint nr_to_add);
 
 /* returns true if the fork has extents but they are not read in yet. */
-static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp)
+static inline bool xfs_need_iread_extents(const struct xfs_ifork *ifp)
 {
-	return ifp->if_format == XFS_DINODE_FMT_BTREE && ifp->if_height == 0;
+	/* see xfs_iformat_{data,attr}_fork() for needextents semantics */
+	return smp_load_acquire(&ifp->if_needextents) != 0;
 }
 
 #endif	/* __XFS_INODE_FORK_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index bcf46aa0d08b..c1c65774dcc2 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -120,45 +120,41 @@ xfs_refcount_btrec_to_irec(
 	irec->rc_refcount = be32_to_cpu(rec->refc.rc_refcount);
 }
 
-/*
- * Get the data from the pointed-to record.
- */
-int
-xfs_refcount_get_rec(
+/* Simple checks for refcount records. */
+xfs_failaddr_t
+xfs_refcount_check_irec(
 	struct xfs_btree_cur		*cur,
-	struct xfs_refcount_irec	*irec,
-	int				*stat)
+	const struct xfs_refcount_irec	*irec)
 {
-	struct xfs_mount		*mp = cur->bc_mp;
 	struct xfs_perag		*pag = cur->bc_ag.pag;
-	union xfs_btree_rec		*rec;
-	int				error;
-
-	error = xfs_btree_get_rec(cur, &rec, stat);
-	if (error || !*stat)
-		return error;
 
-	xfs_refcount_btrec_to_irec(rec, irec);
 	if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN)
-		goto out_bad_rec;
+		return __this_address;
 
 	if (!xfs_refcount_check_domain(irec))
-		goto out_bad_rec;
+		return __this_address;
 
 	/* check for valid extent range, including overflow */
 	if (!xfs_verify_agbext(pag, irec->rc_startblock, irec->rc_blockcount))
-		goto out_bad_rec;
+		return __this_address;
 
 	if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT)
-		goto out_bad_rec;
+		return __this_address;
 
-	trace_xfs_refcount_get(cur->bc_mp, pag->pag_agno, irec);
-	return 0;
+	return NULL;
+}
+
+static inline int
+xfs_refcount_complain_bad_rec(
+	struct xfs_btree_cur		*cur,
+	xfs_failaddr_t			fa,
+	const struct xfs_refcount_irec	*irec)
+{
+	struct xfs_mount		*mp = cur->bc_mp;
 
-out_bad_rec:
 	xfs_warn(mp,
-		"Refcount BTree record corruption in AG %d detected!",
-		pag->pag_agno);
+ "Refcount BTree record corruption in AG %d detected at %pS!",
+				cur->bc_ag.pag->pag_agno, fa);
 	xfs_warn(mp,
 		"Start block 0x%x, block count 0x%x, references 0x%x",
 		irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount);
@@ -166,6 +162,32 @@ out_bad_rec:
 }
 
 /*
+ * Get the data from the pointed-to record.
+ */
+int
+xfs_refcount_get_rec(
+	struct xfs_btree_cur		*cur,
+	struct xfs_refcount_irec	*irec,
+	int				*stat)
+{
+	union xfs_btree_rec		*rec;
+	xfs_failaddr_t			fa;
+	int				error;
+
+	error = xfs_btree_get_rec(cur, &rec, stat);
+	if (error || !*stat)
+		return error;
+
+	xfs_refcount_btrec_to_irec(rec, irec);
+	fa = xfs_refcount_check_irec(cur, irec);
+	if (fa)
+		return xfs_refcount_complain_bad_rec(cur, fa, irec);
+
+	trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
+	return 0;
+}
+
+/*
  * Update the record referred to by cur to the value given
  * by [bno, len, refcount].
  * This either works (return 0) or gets an EFSCORRUPTED error.
@@ -1332,26 +1354,22 @@ xfs_refcount_finish_one(
 	xfs_agblock_t			bno;
 	unsigned long			nr_ops = 0;
 	int				shape_changes = 0;
-	struct xfs_perag		*pag;
 
-	pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock));
 	bno = XFS_FSB_TO_AGBNO(mp, ri->ri_startblock);
 
 	trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock),
 			ri->ri_type, XFS_FSB_TO_AGBNO(mp, ri->ri_startblock),
 			ri->ri_blockcount);
 
-	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) {
-		error = -EIO;
-		goto out_drop;
-	}
+	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
+		return -EIO;
 
 	/*
 	 * If we haven't gotten a cursor or the cursor AG doesn't match
 	 * the startblock, get one now.
 	 */
 	rcur = *pcur;
-	if (rcur != NULL && rcur->bc_ag.pag != pag) {
+	if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) {
 		nr_ops = rcur->bc_ag.refc.nr_ops;
 		shape_changes = rcur->bc_ag.refc.shape_changes;
 		xfs_refcount_finish_one_cleanup(tp, rcur, 0);
@@ -1359,12 +1377,12 @@ xfs_refcount_finish_one(
 		*pcur = NULL;
 	}
 	if (rcur == NULL) {
-		error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_FREEING,
-				&agbp);
+		error = xfs_alloc_read_agf(ri->ri_pag, tp,
+				XFS_ALLOC_FLAG_FREEING, &agbp);
 		if (error)
-			goto out_drop;
+			return error;
 
-		rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag);
+		rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, ri->ri_pag);
 		rcur->bc_ag.refc.nr_ops = nr_ops;
 		rcur->bc_ag.refc.shape_changes = shape_changes;
 	}
@@ -1375,7 +1393,7 @@ xfs_refcount_finish_one(
 		error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount,
 				XFS_REFCOUNT_ADJUST_INCREASE);
 		if (error)
-			goto out_drop;
+			return error;
 		if (ri->ri_blockcount > 0)
 			error = xfs_refcount_continue_op(rcur, ri, bno);
 		break;
@@ -1383,31 +1401,29 @@ xfs_refcount_finish_one(
 		error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount,
 				XFS_REFCOUNT_ADJUST_DECREASE);
 		if (error)
-			goto out_drop;
+			return error;
 		if (ri->ri_blockcount > 0)
 			error = xfs_refcount_continue_op(rcur, ri, bno);
 		break;
 	case XFS_REFCOUNT_ALLOC_COW:
 		error = __xfs_refcount_cow_alloc(rcur, bno, ri->ri_blockcount);
 		if (error)
-			goto out_drop;
+			return error;
 		ri->ri_blockcount = 0;
 		break;
 	case XFS_REFCOUNT_FREE_COW:
 		error = __xfs_refcount_cow_free(rcur, bno, ri->ri_blockcount);
 		if (error)
-			goto out_drop;
+			return error;
 		ri->ri_blockcount = 0;
 		break;
 	default:
 		ASSERT(0);
-		error = -EFSCORRUPTED;
+		return -EFSCORRUPTED;
 	}
 	if (!error && ri->ri_blockcount > 0)
-		trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno,
+		trace_xfs_refcount_finish_one_leftover(mp, ri->ri_pag->pag_agno,
 				ri->ri_type, bno, ri->ri_blockcount);
-out_drop:
-	xfs_perag_put(pag);
 	return error;
 }
 
@@ -1435,6 +1451,7 @@ __xfs_refcount_add(
 	ri->ri_startblock = startblock;
 	ri->ri_blockcount = blockcount;
 
+	xfs_refcount_update_get_group(tp->t_mountp, ri);
 	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list);
 }
 
@@ -1876,7 +1893,8 @@ xfs_refcount_recover_extent(
 	INIT_LIST_HEAD(&rr->rr_list);
 	xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
 
-	if (XFS_IS_CORRUPT(cur->bc_mp,
+	if (xfs_refcount_check_irec(cur, &rr->rr_rrec) != NULL ||
+	    XFS_IS_CORRUPT(cur->bc_mp,
 			   rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) {
 		kfree(rr);
 		return -EFSCORRUPTED;
@@ -1980,14 +1998,17 @@ out_free:
 	return error;
 }
 
-/* Is there a record covering a given extent? */
+/*
+ * Scan part of the keyspace of the refcount records and tell us if the area
+ * has no records, is fully mapped by records, or is partially filled.
+ */
 int
-xfs_refcount_has_record(
+xfs_refcount_has_records(
 	struct xfs_btree_cur	*cur,
 	enum xfs_refc_domain	domain,
 	xfs_agblock_t		bno,
 	xfs_extlen_t		len,
-	bool			*exists)
+	enum xbtree_recpacking	*outcome)
 {
 	union xfs_btree_irec	low;
 	union xfs_btree_irec	high;
@@ -1998,7 +2019,7 @@ xfs_refcount_has_record(
 	high.rc.rc_startblock = bno + len - 1;
 	low.rc.rc_domain = high.rc.rc_domain = domain;
 
-	return xfs_btree_has_record(cur, &low, &high, exists);
+	return xfs_btree_has_records(cur, &low, &high, NULL, outcome);
 }
 
 int __init
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index c633477ce3ce..783cd89ca195 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -50,6 +50,7 @@ enum xfs_refcount_intent_type {
 
 struct xfs_refcount_intent {
 	struct list_head			ri_list;
+	struct xfs_perag			*ri_pag;
 	enum xfs_refcount_intent_type		ri_type;
 	xfs_extlen_t				ri_blockcount;
 	xfs_fsblock_t				ri_startblock;
@@ -67,6 +68,9 @@ xfs_refcount_check_domain(
 	return true;
 }
 
+void xfs_refcount_update_get_group(struct xfs_mount *mp,
+		struct xfs_refcount_intent *ri);
+
 void xfs_refcount_increase_extent(struct xfs_trans *tp,
 		struct xfs_bmbt_irec *irec);
 void xfs_refcount_decrease_extent(struct xfs_trans *tp,
@@ -107,12 +111,14 @@ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
  */
 #define XFS_REFCOUNT_ITEM_OVERHEAD	32
 
-extern int xfs_refcount_has_record(struct xfs_btree_cur *cur,
+extern int xfs_refcount_has_records(struct xfs_btree_cur *cur,
 		enum xfs_refc_domain domain, xfs_agblock_t bno,
-		xfs_extlen_t len, bool *exists);
+		xfs_extlen_t len, enum xbtree_recpacking *outcome);
 union xfs_btree_rec;
 extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec,
 		struct xfs_refcount_irec *irec);
+xfs_failaddr_t xfs_refcount_check_irec(struct xfs_btree_cur *cur,
+		const struct xfs_refcount_irec *irec);
 extern int xfs_refcount_insert(struct xfs_btree_cur *cur,
 		struct xfs_refcount_irec *irec, int *stat);
 
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index f3b860970b26..d4afc5f4e6a5 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -112,8 +112,9 @@ xfs_refcountbt_free_block(
 			XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1);
 	be32_add_cpu(&agf->agf_refcount_blocks, -1);
 	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
-	error = xfs_free_extent(cur->bc_tp, fsbno, 1, &XFS_RMAP_OINFO_REFC,
-			XFS_AG_RESV_METADATA);
+	error = xfs_free_extent(cur->bc_tp, cur->bc_ag.pag,
+			XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1,
+			&XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA);
 	if (error)
 		return error;
 
@@ -201,10 +202,13 @@ STATIC int64_t
 xfs_refcountbt_diff_two_keys(
 	struct xfs_btree_cur		*cur,
 	const union xfs_btree_key	*k1,
-	const union xfs_btree_key	*k2)
+	const union xfs_btree_key	*k2,
+	const union xfs_btree_key	*mask)
 {
+	ASSERT(!mask || mask->refc.rc_startblock);
+
 	return (int64_t)be32_to_cpu(k1->refc.rc_startblock) -
-			  be32_to_cpu(k2->refc.rc_startblock);
+			be32_to_cpu(k2->refc.rc_startblock);
 }
 
 STATIC xfs_failaddr_t
@@ -299,6 +303,19 @@ xfs_refcountbt_recs_inorder(
 		be32_to_cpu(r2->refc.rc_startblock);
 }
 
+STATIC enum xbtree_key_contig
+xfs_refcountbt_keys_contiguous(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key1,
+	const union xfs_btree_key	*key2,
+	const union xfs_btree_key	*mask)
+{
+	ASSERT(!mask || mask->refc.rc_startblock);
+
+	return xbtree_key_contig(be32_to_cpu(key1->refc.rc_startblock),
+				 be32_to_cpu(key2->refc.rc_startblock));
+}
+
 static const struct xfs_btree_ops xfs_refcountbt_ops = {
 	.rec_len		= sizeof(struct xfs_refcount_rec),
 	.key_len		= sizeof(struct xfs_refcount_key),
@@ -318,6 +335,7 @@ static const struct xfs_btree_ops xfs_refcountbt_ops = {
 	.diff_two_keys		= xfs_refcountbt_diff_two_keys,
 	.keys_inorder		= xfs_refcountbt_keys_inorder,
 	.recs_inorder		= xfs_refcountbt_recs_inorder,
+	.keys_contiguous	= xfs_refcountbt_keys_contiguous,
 };
 
 /*
@@ -339,10 +357,7 @@ xfs_refcountbt_init_common(
 
 	cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
 
-	/* take a reference for the cursor */
-	atomic_inc(&pag->pag_ref);
-	cur->bc_ag.pag = pag;
-
+	cur->bc_ag.pag = xfs_perag_hold(pag);
 	cur->bc_ag.refc.nr_ops = 0;
 	cur->bc_ag.refc.shape_changes = 0;
 	cur->bc_ops = &xfs_refcountbt_ops;
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index df720041cd3d..f4dc23b3b837 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -193,7 +193,7 @@ done:
 }
 
 /* Convert an internal btree record to an rmap record. */
-int
+xfs_failaddr_t
 xfs_rmap_btrec_to_irec(
 	const union xfs_btree_rec	*rec,
 	struct xfs_rmap_irec		*irec)
@@ -205,51 +205,74 @@ xfs_rmap_btrec_to_irec(
 			irec);
 }
 
-/*
- * Get the data from the pointed-to record.
- */
-int
-xfs_rmap_get_rec(
-	struct xfs_btree_cur	*cur,
-	struct xfs_rmap_irec	*irec,
-	int			*stat)
+/* Simple checks for rmap records. */
+xfs_failaddr_t
+xfs_rmap_check_irec(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*irec)
 {
-	struct xfs_mount	*mp = cur->bc_mp;
-	struct xfs_perag	*pag = cur->bc_ag.pag;
-	union xfs_btree_rec	*rec;
-	int			error;
-
-	error = xfs_btree_get_rec(cur, &rec, stat);
-	if (error || !*stat)
-		return error;
-
-	if (xfs_rmap_btrec_to_irec(rec, irec))
-		goto out_bad_rec;
+	struct xfs_mount		*mp = cur->bc_mp;
+	bool				is_inode;
+	bool				is_unwritten;
+	bool				is_bmbt;
+	bool				is_attr;
 
 	if (irec->rm_blockcount == 0)
-		goto out_bad_rec;
+		return __this_address;
 	if (irec->rm_startblock <= XFS_AGFL_BLOCK(mp)) {
 		if (irec->rm_owner != XFS_RMAP_OWN_FS)
-			goto out_bad_rec;
+			return __this_address;
 		if (irec->rm_blockcount != XFS_AGFL_BLOCK(mp) + 1)
-			goto out_bad_rec;
+			return __this_address;
 	} else {
 		/* check for valid extent range, including overflow */
-		if (!xfs_verify_agbext(pag, irec->rm_startblock,
-					    irec->rm_blockcount))
-			goto out_bad_rec;
+		if (!xfs_verify_agbext(cur->bc_ag.pag, irec->rm_startblock,
+						       irec->rm_blockcount))
+			return __this_address;
 	}
 
 	if (!(xfs_verify_ino(mp, irec->rm_owner) ||
 	      (irec->rm_owner <= XFS_RMAP_OWN_FS &&
 	       irec->rm_owner >= XFS_RMAP_OWN_MIN)))
-		goto out_bad_rec;
+		return __this_address;
+
+	/* Check flags. */
+	is_inode = !XFS_RMAP_NON_INODE_OWNER(irec->rm_owner);
+	is_bmbt = irec->rm_flags & XFS_RMAP_BMBT_BLOCK;
+	is_attr = irec->rm_flags & XFS_RMAP_ATTR_FORK;
+	is_unwritten = irec->rm_flags & XFS_RMAP_UNWRITTEN;
+
+	if (is_bmbt && irec->rm_offset != 0)
+		return __this_address;
+
+	if (!is_inode && irec->rm_offset != 0)
+		return __this_address;
+
+	if (is_unwritten && (is_bmbt || !is_inode || is_attr))
+		return __this_address;
+
+	if (!is_inode && (is_bmbt || is_unwritten || is_attr))
+		return __this_address;
+
+	/* Check for a valid fork offset, if applicable. */
+	if (is_inode && !is_bmbt &&
+	    !xfs_verify_fileext(mp, irec->rm_offset, irec->rm_blockcount))
+		return __this_address;
+
+	return NULL;
+}
+
+static inline int
+xfs_rmap_complain_bad_rec(
+	struct xfs_btree_cur		*cur,
+	xfs_failaddr_t			fa,
+	const struct xfs_rmap_irec	*irec)
+{
+	struct xfs_mount		*mp = cur->bc_mp;
 
-	return 0;
-out_bad_rec:
 	xfs_warn(mp,
-		"Reverse Mapping BTree record corruption in AG %d detected!",
-		pag->pag_agno);
+		"Reverse Mapping BTree record corruption in AG %d detected at %pS!",
+		cur->bc_ag.pag->pag_agno, fa);
 	xfs_warn(mp,
 		"Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x",
 		irec->rm_owner, irec->rm_flags, irec->rm_startblock,
@@ -257,6 +280,32 @@ out_bad_rec:
 	return -EFSCORRUPTED;
 }
 
+/*
+ * Get the data from the pointed-to record.
+ */
+int
+xfs_rmap_get_rec(
+	struct xfs_btree_cur	*cur,
+	struct xfs_rmap_irec	*irec,
+	int			*stat)
+{
+	union xfs_btree_rec	*rec;
+	xfs_failaddr_t		fa;
+	int			error;
+
+	error = xfs_btree_get_rec(cur, &rec, stat);
+	if (error || !*stat)
+		return error;
+
+	fa = xfs_rmap_btrec_to_irec(rec, irec);
+	if (!fa)
+		fa = xfs_rmap_check_irec(cur, irec);
+	if (fa)
+		return xfs_rmap_complain_bad_rec(cur, fa, irec);
+
+	return 0;
+}
+
 struct xfs_find_left_neighbor_info {
 	struct xfs_rmap_irec	high;
 	struct xfs_rmap_irec	*irec;
@@ -2320,11 +2369,14 @@ xfs_rmap_query_range_helper(
 {
 	struct xfs_rmap_query_range_info	*query = priv;
 	struct xfs_rmap_irec			irec;
-	int					error;
+	xfs_failaddr_t				fa;
+
+	fa = xfs_rmap_btrec_to_irec(rec, &irec);
+	if (!fa)
+		fa = xfs_rmap_check_irec(cur, &irec);
+	if (fa)
+		return xfs_rmap_complain_bad_rec(cur, fa, &irec);
 
-	error = xfs_rmap_btrec_to_irec(rec, &irec);
-	if (error)
-		return error;
 	return query->fn(cur, &irec, query->priv);
 }
 
@@ -2394,7 +2446,6 @@ xfs_rmap_finish_one(
 	struct xfs_btree_cur		**pcur)
 {
 	struct xfs_mount		*mp = tp->t_mountp;
-	struct xfs_perag		*pag;
 	struct xfs_btree_cur		*rcur;
 	struct xfs_buf			*agbp = NULL;
 	int				error = 0;
@@ -2402,26 +2453,22 @@ xfs_rmap_finish_one(
 	xfs_agblock_t			bno;
 	bool				unwritten;
 
-	pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock));
 	bno = XFS_FSB_TO_AGBNO(mp, ri->ri_bmap.br_startblock);
 
-	trace_xfs_rmap_deferred(mp, pag->pag_agno, ri->ri_type, bno,
+	trace_xfs_rmap_deferred(mp, ri->ri_pag->pag_agno, ri->ri_type, bno,
 			ri->ri_owner, ri->ri_whichfork,
 			ri->ri_bmap.br_startoff, ri->ri_bmap.br_blockcount,
 			ri->ri_bmap.br_state);
 
-	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) {
-		error = -EIO;
-		goto out_drop;
-	}
-
+	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE))
+		return -EIO;
 
 	/*
 	 * If we haven't gotten a cursor or the cursor AG doesn't match
 	 * the startblock, get one now.
 	 */
 	rcur = *pcur;
-	if (rcur != NULL && rcur->bc_ag.pag != pag) {
+	if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) {
 		xfs_rmap_finish_one_cleanup(tp, rcur, 0);
 		rcur = NULL;
 		*pcur = NULL;
@@ -2432,15 +2479,13 @@ xfs_rmap_finish_one(
 		 * rmapbt, because a shape change could cause us to
 		 * allocate blocks.
 		 */
-		error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
+		error = xfs_free_extent_fix_freelist(tp, ri->ri_pag, &agbp);
 		if (error)
-			goto out_drop;
-		if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) {
-			error = -EFSCORRUPTED;
-			goto out_drop;
-		}
+			return error;
+		if (XFS_IS_CORRUPT(tp->t_mountp, !agbp))
+			return -EFSCORRUPTED;
 
-		rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
+		rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, ri->ri_pag);
 	}
 	*pcur = rcur;
 
@@ -2480,8 +2525,7 @@ xfs_rmap_finish_one(
 		ASSERT(0);
 		error = -EFSCORRUPTED;
 	}
-out_drop:
-	xfs_perag_put(pag);
+
 	return error;
 }
 
@@ -2526,6 +2570,7 @@ __xfs_rmap_add(
 	ri->ri_whichfork = whichfork;
 	ri->ri_bmap = *bmap;
 
+	xfs_rmap_update_get_group(tp->t_mountp, ri);
 	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list);
 }
 
@@ -2664,14 +2709,21 @@ xfs_rmap_compare(
 		return 0;
 }
 
-/* Is there a record covering a given extent? */
+/*
+ * Scan the physical storage part of the keyspace of the reverse mapping index
+ * and tell us if the area has no records, is fully mapped by records, or is
+ * partially filled.
+ */
 int
-xfs_rmap_has_record(
+xfs_rmap_has_records(
 	struct xfs_btree_cur	*cur,
 	xfs_agblock_t		bno,
 	xfs_extlen_t		len,
-	bool			*exists)
+	enum xbtree_recpacking	*outcome)
 {
+	union xfs_btree_key	mask = {
+		.rmap.rm_startblock = cpu_to_be32(-1U),
+	};
 	union xfs_btree_irec	low;
 	union xfs_btree_irec	high;
 
@@ -2680,68 +2732,144 @@ xfs_rmap_has_record(
 	memset(&high, 0xFF, sizeof(high));
 	high.r.rm_startblock = bno + len - 1;
 
-	return xfs_btree_has_record(cur, &low, &high, exists);
+	return xfs_btree_has_records(cur, &low, &high, &mask, outcome);
 }
 
-/*
- * Is there a record for this owner completely covering a given physical
- * extent?  If so, *has_rmap will be set to true.  If there is no record
- * or the record only covers part of the range, we set *has_rmap to false.
- * This function doesn't perform range lookups or offset checks, so it is
- * not suitable for checking data fork blocks.
- */
-int
-xfs_rmap_record_exists(
-	struct xfs_btree_cur		*cur,
+struct xfs_rmap_ownercount {
+	/* Owner that we're looking for. */
+	struct xfs_rmap_irec	good;
+
+	/* rmap search keys */
+	struct xfs_rmap_irec	low;
+	struct xfs_rmap_irec	high;
+
+	struct xfs_rmap_matches	*results;
+
+	/* Stop early if we find a nonmatch? */
+	bool			stop_on_nonmatch;
+};
+
+/* Does this rmap represent space that can have multiple owners? */
+static inline bool
+xfs_rmap_shareable(
+	struct xfs_mount		*mp,
+	const struct xfs_rmap_irec	*rmap)
+{
+	if (!xfs_has_reflink(mp))
+		return false;
+	if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
+		return false;
+	if (rmap->rm_flags & (XFS_RMAP_ATTR_FORK |
+			      XFS_RMAP_BMBT_BLOCK))
+		return false;
+	return true;
+}
+
+static inline void
+xfs_rmap_ownercount_init(
+	struct xfs_rmap_ownercount	*roc,
 	xfs_agblock_t			bno,
 	xfs_extlen_t			len,
 	const struct xfs_owner_info	*oinfo,
-	bool				*has_rmap)
+	struct xfs_rmap_matches		*results)
 {
-	uint64_t			owner;
-	uint64_t			offset;
-	unsigned int			flags;
-	int				has_record;
-	struct xfs_rmap_irec		irec;
-	int				error;
+	memset(roc, 0, sizeof(*roc));
+	roc->results = results;
+
+	roc->low.rm_startblock = bno;
+	memset(&roc->high, 0xFF, sizeof(roc->high));
+	roc->high.rm_startblock = bno + len - 1;
+
+	memset(results, 0, sizeof(*results));
+	roc->good.rm_startblock = bno;
+	roc->good.rm_blockcount = len;
+	roc->good.rm_owner = oinfo->oi_owner;
+	roc->good.rm_offset = oinfo->oi_offset;
+	if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
+		roc->good.rm_flags |= XFS_RMAP_ATTR_FORK;
+	if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
+		roc->good.rm_flags |= XFS_RMAP_BMBT_BLOCK;
+}
 
-	xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
-	ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) ||
-	       (flags & XFS_RMAP_BMBT_BLOCK));
+/* Figure out if this is a match for the owner. */
+STATIC int
+xfs_rmap_count_owners_helper(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*priv)
+{
+	struct xfs_rmap_ownercount	*roc = priv;
+	struct xfs_rmap_irec		check = *rec;
+	unsigned int			keyflags;
+	bool				filedata;
+	int64_t				delta;
+
+	filedata = !XFS_RMAP_NON_INODE_OWNER(check.rm_owner) &&
+		   !(check.rm_flags & XFS_RMAP_BMBT_BLOCK);
+
+	/* Trim the part of check that comes before the comparison range. */
+	delta = (int64_t)roc->good.rm_startblock - check.rm_startblock;
+	if (delta > 0) {
+		check.rm_startblock += delta;
+		check.rm_blockcount -= delta;
+		if (filedata)
+			check.rm_offset += delta;
+	}
 
-	error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &irec,
-			&has_record);
-	if (error)
-		return error;
-	if (!has_record) {
-		*has_rmap = false;
-		return 0;
+	/* Trim the part of check that comes after the comparison range. */
+	delta = (check.rm_startblock + check.rm_blockcount) -
+		(roc->good.rm_startblock + roc->good.rm_blockcount);
+	if (delta > 0)
+		check.rm_blockcount -= delta;
+
+	/* Don't care about unwritten status for establishing ownership. */
+	keyflags = check.rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK);
+
+	if (check.rm_startblock	== roc->good.rm_startblock &&
+	    check.rm_blockcount	== roc->good.rm_blockcount &&
+	    check.rm_owner	== roc->good.rm_owner &&
+	    check.rm_offset	== roc->good.rm_offset &&
+	    keyflags		== roc->good.rm_flags) {
+		roc->results->matches++;
+	} else {
+		roc->results->non_owner_matches++;
+		if (xfs_rmap_shareable(cur->bc_mp, &roc->good) ^
+		    xfs_rmap_shareable(cur->bc_mp, &check))
+			roc->results->bad_non_owner_matches++;
 	}
 
-	*has_rmap = (irec.rm_owner == owner && irec.rm_startblock <= bno &&
-		     irec.rm_startblock + irec.rm_blockcount >= bno + len);
+	if (roc->results->non_owner_matches && roc->stop_on_nonmatch)
+		return -ECANCELED;
+
 	return 0;
 }
 
-struct xfs_rmap_key_state {
-	uint64_t			owner;
-	uint64_t			offset;
-	unsigned int			flags;
-};
-
-/* For each rmap given, figure out if it doesn't match the key we want. */
-STATIC int
-xfs_rmap_has_other_keys_helper(
+/* Count the number of owners and non-owners of this range of blocks. */
+int
+xfs_rmap_count_owners(
 	struct xfs_btree_cur		*cur,
-	const struct xfs_rmap_irec	*rec,
-	void				*priv)
+	xfs_agblock_t			bno,
+	xfs_extlen_t			len,
+	const struct xfs_owner_info	*oinfo,
+	struct xfs_rmap_matches		*results)
 {
-	struct xfs_rmap_key_state	*rks = priv;
+	struct xfs_rmap_ownercount	roc;
+	int				error;
 
-	if (rks->owner == rec->rm_owner && rks->offset == rec->rm_offset &&
-	    ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags)
-		return 0;
-	return -ECANCELED;
+	xfs_rmap_ownercount_init(&roc, bno, len, oinfo, results);
+	error = xfs_rmap_query_range(cur, &roc.low, &roc.high,
+			xfs_rmap_count_owners_helper, &roc);
+	if (error)
+		return error;
+
+	/*
+	 * There can't be any non-owner rmaps that conflict with the given
+	 * owner if we didn't find any rmaps matching the owner.
+	 */
+	if (!results->matches)
+		results->bad_non_owner_matches = 0;
+
+	return 0;
 }
 
 /*
@@ -2754,28 +2882,26 @@ xfs_rmap_has_other_keys(
 	xfs_agblock_t			bno,
 	xfs_extlen_t			len,
 	const struct xfs_owner_info	*oinfo,
-	bool				*has_rmap)
+	bool				*has_other)
 {
-	struct xfs_rmap_irec		low = {0};
-	struct xfs_rmap_irec		high;
-	struct xfs_rmap_key_state	rks;
+	struct xfs_rmap_matches		res;
+	struct xfs_rmap_ownercount	roc;
 	int				error;
 
-	xfs_owner_info_unpack(oinfo, &rks.owner, &rks.offset, &rks.flags);
-	*has_rmap = false;
-
-	low.rm_startblock = bno;
-	memset(&high, 0xFF, sizeof(high));
-	high.rm_startblock = bno + len - 1;
+	xfs_rmap_ownercount_init(&roc, bno, len, oinfo, &res);
+	roc.stop_on_nonmatch = true;
 
-	error = xfs_rmap_query_range(cur, &low, &high,
-			xfs_rmap_has_other_keys_helper, &rks);
+	error = xfs_rmap_query_range(cur, &roc.low, &roc.high,
+			xfs_rmap_count_owners_helper, &roc);
 	if (error == -ECANCELED) {
-		*has_rmap = true;
+		*has_other = true;
 		return 0;
 	}
+	if (error)
+		return error;
 
-	return error;
+	*has_other = false;
+	return 0;
 }
 
 const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = {
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 2dac88cea28d..3c98d9d50afb 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -62,13 +62,14 @@ xfs_rmap_irec_offset_pack(
 	return x;
 }
 
-static inline int
+static inline xfs_failaddr_t
 xfs_rmap_irec_offset_unpack(
 	__u64			offset,
 	struct xfs_rmap_irec	*irec)
 {
 	if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS))
-		return -EFSCORRUPTED;
+		return __this_address;
+
 	irec->rm_offset = XFS_RMAP_OFF(offset);
 	irec->rm_flags = 0;
 	if (offset & XFS_RMAP_OFF_ATTR_FORK)
@@ -77,7 +78,7 @@ xfs_rmap_irec_offset_unpack(
 		irec->rm_flags |= XFS_RMAP_BMBT_BLOCK;
 	if (offset & XFS_RMAP_OFF_UNWRITTEN)
 		irec->rm_flags |= XFS_RMAP_UNWRITTEN;
-	return 0;
+	return NULL;
 }
 
 static inline void
@@ -162,8 +163,12 @@ struct xfs_rmap_intent {
 	int					ri_whichfork;
 	uint64_t				ri_owner;
 	struct xfs_bmbt_irec			ri_bmap;
+	struct xfs_perag			*ri_pag;
 };
 
+void xfs_rmap_update_get_group(struct xfs_mount *mp,
+		struct xfs_rmap_intent *ri);
+
 /* functions for updating the rmapbt based on bmbt map/unmap operations */
 void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 		int whichfork, struct xfs_bmbt_irec *imap);
@@ -188,16 +193,31 @@ int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
 int xfs_rmap_compare(const struct xfs_rmap_irec *a,
 		const struct xfs_rmap_irec *b);
 union xfs_btree_rec;
-int xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec,
+xfs_failaddr_t xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec,
 		struct xfs_rmap_irec *irec);
-int xfs_rmap_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-		xfs_extlen_t len, bool *exists);
-int xfs_rmap_record_exists(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+xfs_failaddr_t xfs_rmap_check_irec(struct xfs_btree_cur *cur,
+		const struct xfs_rmap_irec *irec);
+
+int xfs_rmap_has_records(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+		xfs_extlen_t len, enum xbtree_recpacking *outcome);
+
+struct xfs_rmap_matches {
+	/* Number of owner matches. */
+	unsigned long long	matches;
+
+	/* Number of non-owner matches. */
+	unsigned long long	non_owner_matches;
+
+	/* Number of non-owner matches that conflict with the owner matches. */
+	unsigned long long	bad_non_owner_matches;
+};
+
+int xfs_rmap_count_owners(struct xfs_btree_cur *cur, xfs_agblock_t bno,
 		xfs_extlen_t len, const struct xfs_owner_info *oinfo,
-		bool *has_rmap);
+		struct xfs_rmap_matches *rmatch);
 int xfs_rmap_has_other_keys(struct xfs_btree_cur *cur, xfs_agblock_t bno,
 		xfs_extlen_t len, const struct xfs_owner_info *oinfo,
-		bool *has_rmap);
+		bool *has_other);
 int xfs_rmap_map_raw(struct xfs_btree_cur *cur, struct xfs_rmap_irec *rmap);
 
 extern const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE;
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index d3285684bb5e..6c81b20e97d2 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -156,6 +156,16 @@ xfs_rmapbt_get_maxrecs(
 	return cur->bc_mp->m_rmap_mxr[level != 0];
 }
 
+/*
+ * Convert the ondisk record's offset field into the ondisk key's offset field.
+ * Fork and bmbt are significant parts of the rmap record key, but written
+ * status is merely a record attribute.
+ */
+static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec)
+{
+	return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
+}
+
 STATIC void
 xfs_rmapbt_init_key_from_rec(
 	union xfs_btree_key		*key,
@@ -163,7 +173,7 @@ xfs_rmapbt_init_key_from_rec(
 {
 	key->rmap.rm_startblock = rec->rmap.rm_startblock;
 	key->rmap.rm_owner = rec->rmap.rm_owner;
-	key->rmap.rm_offset = rec->rmap.rm_offset;
+	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
 }
 
 /*
@@ -186,7 +196,7 @@ xfs_rmapbt_init_high_key_from_rec(
 	key->rmap.rm_startblock = rec->rmap.rm_startblock;
 	be32_add_cpu(&key->rmap.rm_startblock, adj);
 	key->rmap.rm_owner = rec->rmap.rm_owner;
-	key->rmap.rm_offset = rec->rmap.rm_offset;
+	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
 	if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
 	    XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
 		return;
@@ -219,6 +229,16 @@ xfs_rmapbt_init_ptr_from_cur(
 	ptr->s = agf->agf_roots[cur->bc_btnum];
 }
 
+/*
+ * Mask the appropriate parts of the ondisk key field for a key comparison.
+ * Fork and bmbt are significant parts of the rmap record key, but written
+ * status is merely a record attribute.
+ */
+static inline uint64_t offset_keymask(uint64_t offset)
+{
+	return offset & ~XFS_RMAP_OFF_UNWRITTEN;
+}
+
 STATIC int64_t
 xfs_rmapbt_key_diff(
 	struct xfs_btree_cur		*cur,
@@ -240,8 +260,8 @@ xfs_rmapbt_key_diff(
 	else if (y > x)
 		return -1;
 
-	x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset));
-	y = rec->rm_offset;
+	x = offset_keymask(be64_to_cpu(kp->rm_offset));
+	y = offset_keymask(xfs_rmap_irec_offset_pack(rec));
 	if (x > y)
 		return 1;
 	else if (y > x)
@@ -253,31 +273,43 @@ STATIC int64_t
 xfs_rmapbt_diff_two_keys(
 	struct xfs_btree_cur		*cur,
 	const union xfs_btree_key	*k1,
-	const union xfs_btree_key	*k2)
+	const union xfs_btree_key	*k2,
+	const union xfs_btree_key	*mask)
 {
 	const struct xfs_rmap_key	*kp1 = &k1->rmap;
 	const struct xfs_rmap_key	*kp2 = &k2->rmap;
 	int64_t				d;
 	__u64				x, y;
 
+	/* Doesn't make sense to mask off the physical space part */
+	ASSERT(!mask || mask->rmap.rm_startblock);
+
 	d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
-		       be32_to_cpu(kp2->rm_startblock);
+		     be32_to_cpu(kp2->rm_startblock);
 	if (d)
 		return d;
 
-	x = be64_to_cpu(kp1->rm_owner);
-	y = be64_to_cpu(kp2->rm_owner);
-	if (x > y)
-		return 1;
-	else if (y > x)
-		return -1;
+	if (!mask || mask->rmap.rm_owner) {
+		x = be64_to_cpu(kp1->rm_owner);
+		y = be64_to_cpu(kp2->rm_owner);
+		if (x > y)
+			return 1;
+		else if (y > x)
+			return -1;
+	}
+
+	if (!mask || mask->rmap.rm_offset) {
+		/* Doesn't make sense to allow offset but not owner */
+		ASSERT(!mask || mask->rmap.rm_owner);
+
+		x = offset_keymask(be64_to_cpu(kp1->rm_offset));
+		y = offset_keymask(be64_to_cpu(kp2->rm_offset));
+		if (x > y)
+			return 1;
+		else if (y > x)
+			return -1;
+	}
 
-	x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset));
-	y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset));
-	if (x > y)
-		return 1;
-	else if (y > x)
-		return -1;
 	return 0;
 }
 
@@ -387,8 +419,8 @@ xfs_rmapbt_keys_inorder(
 		return 1;
 	else if (a > b)
 		return 0;
-	a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset));
-	b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset));
+	a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset));
+	b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset));
 	if (a <= b)
 		return 1;
 	return 0;
@@ -417,13 +449,33 @@ xfs_rmapbt_recs_inorder(
 		return 1;
 	else if (a > b)
 		return 0;
-	a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset));
-	b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset));
+	a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset));
+	b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset));
 	if (a <= b)
 		return 1;
 	return 0;
 }
 
+STATIC enum xbtree_key_contig
+xfs_rmapbt_keys_contiguous(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key1,
+	const union xfs_btree_key	*key2,
+	const union xfs_btree_key	*mask)
+{
+	ASSERT(!mask || mask->rmap.rm_startblock);
+
+	/*
+	 * We only support checking contiguity of the physical space component.
+	 * If any callers ever need more specificity than that, they'll have to
+	 * implement it here.
+	 */
+	ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset));
+
+	return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock),
+				 be32_to_cpu(key2->rmap.rm_startblock));
+}
+
 static const struct xfs_btree_ops xfs_rmapbt_ops = {
 	.rec_len		= sizeof(struct xfs_rmap_rec),
 	.key_len		= 2 * sizeof(struct xfs_rmap_key),
@@ -443,6 +495,7 @@ static const struct xfs_btree_ops xfs_rmapbt_ops = {
 	.diff_two_keys		= xfs_rmapbt_diff_two_keys,
 	.keys_inorder		= xfs_rmapbt_keys_inorder,
 	.recs_inorder		= xfs_rmapbt_recs_inorder,
+	.keys_contiguous	= xfs_rmapbt_keys_contiguous,
 };
 
 static struct xfs_btree_cur *
@@ -460,10 +513,7 @@ xfs_rmapbt_init_common(
 	cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
 	cur->bc_ops = &xfs_rmapbt_ops;
 
-	/* take a reference for the cursor */
-	atomic_inc(&pag->pag_ref);
-	cur->bc_ag.pag = pag;
-
+	cur->bc_ag.pag = xfs_perag_hold(pag);
 	return cur;
 }
 
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 99cc03a298e2..ba0f17bc1dc0 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -72,7 +72,8 @@ xfs_sb_validate_v5_features(
 }
 
 /*
- * We support all XFS versions newer than a v4 superblock with V2 directories.
+ * We current support XFS v5 formats with known features and v4 superblocks with
+ * at least V2 directories.
  */
 bool
 xfs_sb_good_version(
@@ -86,16 +87,16 @@ xfs_sb_good_version(
 	if (xfs_sb_is_v5(sbp))
 		return xfs_sb_validate_v5_features(sbp);
 
+	/* versions prior to v4 are not supported */
+	if (XFS_SB_VERSION_NUM(sbp) != XFS_SB_VERSION_4)
+		return false;
+
 	/* We must not have any unknown v4 feature bits set */
 	if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
 	    ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
 	     (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
 		return false;
 
-	/* versions prior to v4 are not supported */
-	if (XFS_SB_VERSION_NUM(sbp) < XFS_SB_VERSION_4)
-		return false;
-
 	/* V4 filesystems need v2 directories and unwritten extents */
 	if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
 		return false;
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 5ebdda7e1078..851220021484 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -204,6 +204,18 @@ enum xfs_ag_resv_type {
 	XFS_AG_RESV_RMAPBT,
 };
 
+/* Results of scanning a btree keyspace to check occupancy. */
+enum xbtree_recpacking {
+	/* None of the keyspace maps to records. */
+	XBTREE_RECPACKING_EMPTY = 0,
+
+	/* Some, but not all, of the keyspace maps to records. */
+	XBTREE_RECPACKING_SPARSE,
+
+	/* The entire keyspace maps to records. */
+	XBTREE_RECPACKING_FULL,
+};
+
 /*
  * Type verifier functions
  */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 4dd52b15f09c..6c6e5eba42c8 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2017 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_fs.h"
@@ -18,6 +18,15 @@
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 
+int
+xchk_setup_agheader(
+	struct xfs_scrub	*sc)
+{
+	if (xchk_need_intent_drain(sc))
+		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+	return xchk_setup_fs(sc);
+}
+
 /* Superblock */
 
 /* Cross-reference with the other btrees. */
@@ -42,8 +51,9 @@ xchk_superblock_xref(
 
 	xchk_xref_is_used_space(sc, agbno, 1);
 	xchk_xref_is_not_inode_chunk(sc, agbno, 1);
-	xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
+	xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
 	xchk_xref_is_not_shared(sc, agbno, 1);
+	xchk_xref_is_not_cow_staging(sc, agbno, 1);
 
 	/* scrub teardown will take care of sc->sa for us */
 }
@@ -505,9 +515,10 @@ xchk_agf_xref(
 	xchk_agf_xref_freeblks(sc);
 	xchk_agf_xref_cntbt(sc);
 	xchk_xref_is_not_inode_chunk(sc, agbno, 1);
-	xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
+	xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
 	xchk_agf_xref_btreeblks(sc);
 	xchk_xref_is_not_shared(sc, agbno, 1);
+	xchk_xref_is_not_cow_staging(sc, agbno, 1);
 	xchk_agf_xref_refcblks(sc);
 
 	/* scrub teardown will take care of sc->sa for us */
@@ -633,8 +644,9 @@ xchk_agfl_block_xref(
 
 	xchk_xref_is_used_space(sc, agbno, 1);
 	xchk_xref_is_not_inode_chunk(sc, agbno, 1);
-	xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_AG);
+	xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_AG);
 	xchk_xref_is_not_shared(sc, agbno, 1);
+	xchk_xref_is_not_cow_staging(sc, agbno, 1);
 }
 
 /* Scrub an AGFL block. */
@@ -689,8 +701,9 @@ xchk_agfl_xref(
 
 	xchk_xref_is_used_space(sc, agbno, 1);
 	xchk_xref_is_not_inode_chunk(sc, agbno, 1);
-	xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
+	xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
 	xchk_xref_is_not_shared(sc, agbno, 1);
+	xchk_xref_is_not_cow_staging(sc, agbno, 1);
 
 	/*
 	 * Scrub teardown will take care of sc->sa for us.  Leave sc->sa
@@ -844,8 +857,9 @@ xchk_agi_xref(
 	xchk_xref_is_used_space(sc, agbno, 1);
 	xchk_xref_is_not_inode_chunk(sc, agbno, 1);
 	xchk_agi_xref_icounts(sc);
-	xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
+	xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
 	xchk_xref_is_not_shared(sc, agbno, 1);
+	xchk_xref_is_not_cow_staging(sc, agbno, 1);
 	xchk_agi_xref_fiblocks(sc);
 
 	/* scrub teardown will take care of sc->sa for us */
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index c37e6d72760b..bbaa65422c4f 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2018 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_fs.h"
@@ -487,10 +487,11 @@ xrep_agfl_walk_rmap(
 /* Strike out the blocks that are cross-linked according to the rmapbt. */
 STATIC int
 xrep_agfl_check_extent(
-	struct xrep_agfl	*ra,
 	uint64_t		start,
-	uint64_t		len)
+	uint64_t		len,
+	void			*priv)
 {
+	struct xrep_agfl	*ra = priv;
 	xfs_agblock_t		agbno = XFS_FSB_TO_AGBNO(ra->sc->mp, start);
 	xfs_agblock_t		last_agbno = agbno + len - 1;
 	int			error;
@@ -538,7 +539,6 @@ xrep_agfl_collect_blocks(
 	struct xrep_agfl	ra;
 	struct xfs_mount	*mp = sc->mp;
 	struct xfs_btree_cur	*cur;
-	struct xbitmap_range	*br, *n;
 	int			error;
 
 	ra.sc = sc;
@@ -579,11 +579,7 @@ xrep_agfl_collect_blocks(
 
 	/* Strike out the blocks that are cross-linked. */
 	ra.rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
-	for_each_xbitmap_extent(br, n, agfl_extents) {
-		error = xrep_agfl_check_extent(&ra, br->start, br->len);
-		if (error)
-			break;
-	}
+	error = xbitmap_walk(agfl_extents, xrep_agfl_check_extent, &ra);
 	xfs_btree_del_cursor(ra.rmap_cur, error);
 	if (error)
 		goto out_bmp;
@@ -629,21 +625,58 @@ xrep_agfl_update_agf(
 			XFS_AGF_FLFIRST | XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
 }
 
+struct xrep_agfl_fill {
+	struct xbitmap		used_extents;
+	struct xfs_scrub	*sc;
+	__be32			*agfl_bno;
+	xfs_agblock_t		flcount;
+	unsigned int		fl_off;
+};
+
+/* Fill the AGFL with whatever blocks are in this extent. */
+static int
+xrep_agfl_fill(
+	uint64_t		start,
+	uint64_t		len,
+	void			*priv)
+{
+	struct xrep_agfl_fill	*af = priv;
+	struct xfs_scrub	*sc = af->sc;
+	xfs_fsblock_t		fsbno = start;
+	int			error;
+
+	while (fsbno < start + len && af->fl_off < af->flcount)
+		af->agfl_bno[af->fl_off++] =
+				cpu_to_be32(XFS_FSB_TO_AGBNO(sc->mp, fsbno++));
+
+	trace_xrep_agfl_insert(sc->mp, sc->sa.pag->pag_agno,
+			XFS_FSB_TO_AGBNO(sc->mp, start), len);
+
+	error = xbitmap_set(&af->used_extents, start, fsbno - 1);
+	if (error)
+		return error;
+
+	if (af->fl_off == af->flcount)
+		return -ECANCELED;
+
+	return 0;
+}
+
 /* Write out a totally new AGFL. */
-STATIC void
+STATIC int
 xrep_agfl_init_header(
 	struct xfs_scrub	*sc,
 	struct xfs_buf		*agfl_bp,
 	struct xbitmap		*agfl_extents,
 	xfs_agblock_t		flcount)
 {
+	struct xrep_agfl_fill	af = {
+		.sc		= sc,
+		.flcount	= flcount,
+	};
 	struct xfs_mount	*mp = sc->mp;
-	__be32			*agfl_bno;
-	struct xbitmap_range	*br;
-	struct xbitmap_range	*n;
 	struct xfs_agfl		*agfl;
-	xfs_agblock_t		agbno;
-	unsigned int		fl_off;
+	int			error;
 
 	ASSERT(flcount <= xfs_agfl_size(mp));
 
@@ -662,36 +695,18 @@ xrep_agfl_init_header(
 	 * blocks than fit in the AGFL, they will be freed in a subsequent
 	 * step.
 	 */
-	fl_off = 0;
-	agfl_bno = xfs_buf_to_agfl_bno(agfl_bp);
-	for_each_xbitmap_extent(br, n, agfl_extents) {
-		agbno = XFS_FSB_TO_AGBNO(mp, br->start);
-
-		trace_xrep_agfl_insert(mp, sc->sa.pag->pag_agno, agbno,
-				br->len);
-
-		while (br->len > 0 && fl_off < flcount) {
-			agfl_bno[fl_off] = cpu_to_be32(agbno);
-			fl_off++;
-			agbno++;
-
-			/*
-			 * We've now used br->start by putting it in the AGFL,
-			 * so bump br so that we don't reap the block later.
-			 */
-			br->start++;
-			br->len--;
-		}
-
-		if (br->len)
-			break;
-		list_del(&br->list);
-		kfree(br);
-	}
+	xbitmap_init(&af.used_extents);
+	af.agfl_bno = xfs_buf_to_agfl_bno(agfl_bp),
+	xbitmap_walk(agfl_extents, xrep_agfl_fill, &af);
+	error = xbitmap_disunion(agfl_extents, &af.used_extents);
+	if (error)
+		return error;
 
 	/* Write new AGFL to disk. */
 	xfs_trans_buf_set_type(sc->tp, agfl_bp, XFS_BLFT_AGFL_BUF);
 	xfs_trans_log_buf(sc->tp, agfl_bp, 0, BBTOB(agfl_bp->b_length) - 1);
+	xbitmap_destroy(&af.used_extents);
+	return 0;
 }
 
 /* Repair the AGFL. */
@@ -744,7 +759,9 @@ xrep_agfl(
 	 * buffers until we know that part works.
 	 */
 	xrep_agfl_update_agf(sc, agf_bp, flcount);
-	xrep_agfl_init_header(sc, agfl_bp, &agfl_extents, flcount);
+	error = xrep_agfl_init_header(sc, agfl_bp, &agfl_extents, flcount);
+	if (error)
+		goto err;
 
 	/*
 	 * Ok, the AGFL should be ready to go now.  Roll the transaction to
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index 3b38f4e2a537..279af72b1671 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2017 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_fs.h"
@@ -24,10 +24,19 @@ int
 xchk_setup_ag_allocbt(
 	struct xfs_scrub	*sc)
 {
+	if (xchk_need_intent_drain(sc))
+		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
 	return xchk_setup_ag_btree(sc, false);
 }
 
 /* Free space btree scrubber. */
+
+struct xchk_alloc {
+	/* Previous free space extent. */
+	struct xfs_alloc_rec_incore	prev;
+};
+
 /*
  * Ensure there's a corresponding cntbt/bnobt record matching this
  * bnobt/cntbt record, respectively.
@@ -75,9 +84,11 @@ xchk_allocbt_xref_other(
 STATIC void
 xchk_allocbt_xref(
 	struct xfs_scrub	*sc,
-	xfs_agblock_t		agbno,
-	xfs_extlen_t		len)
+	const struct xfs_alloc_rec_incore *irec)
 {
+	xfs_agblock_t		agbno = irec->ar_startblock;
+	xfs_extlen_t		len = irec->ar_blockcount;
+
 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
 		return;
 
@@ -85,25 +96,44 @@ xchk_allocbt_xref(
 	xchk_xref_is_not_inode_chunk(sc, agbno, len);
 	xchk_xref_has_no_owner(sc, agbno, len);
 	xchk_xref_is_not_shared(sc, agbno, len);
+	xchk_xref_is_not_cow_staging(sc, agbno, len);
+}
+
+/* Flag failures for records that could be merged. */
+STATIC void
+xchk_allocbt_mergeable(
+	struct xchk_btree	*bs,
+	struct xchk_alloc	*ca,
+	const struct xfs_alloc_rec_incore *irec)
+{
+	if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return;
+
+	if (ca->prev.ar_blockcount > 0 &&
+	    ca->prev.ar_startblock + ca->prev.ar_blockcount == irec->ar_startblock &&
+	    ca->prev.ar_blockcount + irec->ar_blockcount < (uint32_t)~0U)
+		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+	memcpy(&ca->prev, irec, sizeof(*irec));
 }
 
 /* Scrub a bnobt/cntbt record. */
 STATIC int
 xchk_allocbt_rec(
-	struct xchk_btree	*bs,
-	const union xfs_btree_rec *rec)
+	struct xchk_btree		*bs,
+	const union xfs_btree_rec	*rec)
 {
-	struct xfs_perag	*pag = bs->cur->bc_ag.pag;
-	xfs_agblock_t		bno;
-	xfs_extlen_t		len;
+	struct xfs_alloc_rec_incore	irec;
+	struct xchk_alloc	*ca = bs->private;
 
-	bno = be32_to_cpu(rec->alloc.ar_startblock);
-	len = be32_to_cpu(rec->alloc.ar_blockcount);
-
-	if (!xfs_verify_agbext(pag, bno, len))
+	xfs_alloc_btrec_to_irec(rec, &irec);
+	if (xfs_alloc_check_irec(bs->cur, &irec) != NULL) {
 		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+		return 0;
+	}
 
-	xchk_allocbt_xref(bs->sc, bno, len);
+	xchk_allocbt_mergeable(bs, ca, &irec);
+	xchk_allocbt_xref(bs->sc, &irec);
 
 	return 0;
 }
@@ -114,10 +144,11 @@ xchk_allocbt(
 	struct xfs_scrub	*sc,
 	xfs_btnum_t		which)
 {
+	struct xchk_alloc	ca = { };
 	struct xfs_btree_cur	*cur;
 
 	cur = which == XFS_BTNUM_BNO ? sc->sa.bno_cur : sc->sa.cnt_cur;
-	return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, NULL);
+	return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca);
 }
 
 int
@@ -141,15 +172,15 @@ xchk_xref_is_used_space(
 	xfs_agblock_t		agbno,
 	xfs_extlen_t		len)
 {
-	bool			is_freesp;
+	enum xbtree_recpacking	outcome;
 	int			error;
 
 	if (!sc->sa.bno_cur || xchk_skip_xref(sc->sm))
 		return;
 
-	error = xfs_alloc_has_record(sc->sa.bno_cur, agbno, len, &is_freesp);
+	error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome);
 	if (!xchk_should_check_xref(sc, &error, &sc->sa.bno_cur))
 		return;
-	if (is_freesp)
+	if (outcome != XBTREE_RECPACKING_EMPTY)
 		xchk_btree_xref_set_corrupt(sc, sc->sa.bno_cur, 0);
 }
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 31529b9bf389..6c16d9530cca 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2017 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_fs.h"
@@ -15,11 +15,51 @@
 #include "xfs_da_btree.h"
 #include "xfs_attr.h"
 #include "xfs_attr_leaf.h"
+#include "xfs_attr_sf.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/dabtree.h"
 #include "scrub/attr.h"
 
+/* Free the buffers linked from the xattr buffer. */
+static void
+xchk_xattr_buf_cleanup(
+	void			*priv)
+{
+	struct xchk_xattr_buf	*ab = priv;
+
+	kvfree(ab->freemap);
+	ab->freemap = NULL;
+	kvfree(ab->usedmap);
+	ab->usedmap = NULL;
+	kvfree(ab->value);
+	ab->value = NULL;
+	ab->value_sz = 0;
+}
+
+/*
+ * Allocate the free space bitmap if we're trying harder; there are leaf blocks
+ * in the attr fork; or we can't tell if there are leaf blocks.
+ */
+static inline bool
+xchk_xattr_want_freemap(
+	struct xfs_scrub	*sc)
+{
+	struct xfs_ifork	*ifp;
+
+	if (sc->flags & XCHK_TRY_HARDER)
+		return true;
+
+	if (!sc->ip)
+		return true;
+
+	ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
+	if (!ifp)
+		return false;
+
+	return xfs_ifork_has_extents(ifp);
+}
+
 /*
  * Allocate enough memory to hold an attr value and attr block bitmaps,
  * reallocating the buffer if necessary.  Buffer contents are not preserved
@@ -28,41 +68,49 @@
 static int
 xchk_setup_xattr_buf(
 	struct xfs_scrub	*sc,
-	size_t			value_size,
-	gfp_t			flags)
+	size_t			value_size)
 {
-	size_t			sz;
+	size_t			bmp_sz;
 	struct xchk_xattr_buf	*ab = sc->buf;
+	void			*new_val;
 
-	/*
-	 * We need enough space to read an xattr value from the file or enough
-	 * space to hold three copies of the xattr free space bitmap.  We don't
-	 * need the buffer space for both purposes at the same time.
-	 */
-	sz = 3 * sizeof(long) * BITS_TO_LONGS(sc->mp->m_attr_geo->blksize);
-	sz = max_t(size_t, sz, value_size);
+	bmp_sz = sizeof(long) * BITS_TO_LONGS(sc->mp->m_attr_geo->blksize);
 
-	/*
-	 * If there's already a buffer, figure out if we need to reallocate it
-	 * to accommodate a larger size.
-	 */
-	if (ab) {
-		if (sz <= ab->sz)
-			return 0;
-		kvfree(ab);
-		sc->buf = NULL;
-	}
+	if (ab)
+		goto resize_value;
 
-	/*
-	 * Don't zero the buffer upon allocation to avoid runtime overhead.
-	 * All users must be careful never to read uninitialized contents.
-	 */
-	ab = kvmalloc(sizeof(*ab) + sz, flags);
+	ab = kvzalloc(sizeof(struct xchk_xattr_buf), XCHK_GFP_FLAGS);
 	if (!ab)
 		return -ENOMEM;
-
-	ab->sz = sz;
 	sc->buf = ab;
+	sc->buf_cleanup = xchk_xattr_buf_cleanup;
+
+	ab->usedmap = kvmalloc(bmp_sz, XCHK_GFP_FLAGS);
+	if (!ab->usedmap)
+		return -ENOMEM;
+
+	if (xchk_xattr_want_freemap(sc)) {
+		ab->freemap = kvmalloc(bmp_sz, XCHK_GFP_FLAGS);
+		if (!ab->freemap)
+			return -ENOMEM;
+	}
+
+resize_value:
+	if (ab->value_sz >= value_size)
+		return 0;
+
+	if (ab->value) {
+		kvfree(ab->value);
+		ab->value = NULL;
+		ab->value_sz = 0;
+	}
+
+	new_val = kvmalloc(value_size, XCHK_GFP_FLAGS);
+	if (!new_val)
+		return -ENOMEM;
+
+	ab->value = new_val;
+	ab->value_sz = value_size;
 	return 0;
 }
 
@@ -79,8 +127,7 @@ xchk_setup_xattr(
 	 * without the inode lock held, which means we can sleep.
 	 */
 	if (sc->flags & XCHK_TRY_HARDER) {
-		error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX,
-				XCHK_GFP_FLAGS);
+		error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX);
 		if (error)
 			return error;
 	}
@@ -111,11 +158,24 @@ xchk_xattr_listent(
 	int				namelen,
 	int				valuelen)
 {
+	struct xfs_da_args		args = {
+		.op_flags		= XFS_DA_OP_NOTIME,
+		.attr_filter		= flags & XFS_ATTR_NSP_ONDISK_MASK,
+		.geo			= context->dp->i_mount->m_attr_geo,
+		.whichfork		= XFS_ATTR_FORK,
+		.dp			= context->dp,
+		.name			= name,
+		.namelen		= namelen,
+		.hashval		= xfs_da_hashname(name, namelen),
+		.trans			= context->tp,
+		.valuelen		= valuelen,
+	};
+	struct xchk_xattr_buf		*ab;
 	struct xchk_xattr		*sx;
-	struct xfs_da_args		args = { NULL };
 	int				error = 0;
 
 	sx = container_of(context, struct xchk_xattr, context);
+	ab = sx->sc->buf;
 
 	if (xchk_should_terminate(sx->sc, &error)) {
 		context->seen_enough = error;
@@ -128,18 +188,32 @@ xchk_xattr_listent(
 		return;
 	}
 
+	/* Only one namespace bit allowed. */
+	if (hweight32(flags & XFS_ATTR_NSP_ONDISK_MASK) > 1) {
+		xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
+		goto fail_xref;
+	}
+
 	/* Does this name make sense? */
 	if (!xfs_attr_namecheck(name, namelen)) {
 		xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
-		return;
+		goto fail_xref;
 	}
 
 	/*
+	 * Local xattr values are stored in the attr leaf block, so we don't
+	 * need to retrieve the value from a remote block to detect corruption
+	 * problems.
+	 */
+	if (flags & XFS_ATTR_LOCAL)
+		goto fail_xref;
+
+	/*
 	 * Try to allocate enough memory to extrat the attr value.  If that
 	 * doesn't work, we overload the seen_enough variable to convey
 	 * the error message back to the main scrub function.
 	 */
-	error = xchk_setup_xattr_buf(sx->sc, valuelen, XCHK_GFP_FLAGS);
+	error = xchk_setup_xattr_buf(sx->sc, valuelen);
 	if (error == -ENOMEM)
 		error = -EDEADLOCK;
 	if (error) {
@@ -147,17 +221,7 @@ xchk_xattr_listent(
 		return;
 	}
 
-	args.op_flags = XFS_DA_OP_NOTIME;
-	args.attr_filter = flags & XFS_ATTR_NSP_ONDISK_MASK;
-	args.geo = context->dp->i_mount->m_attr_geo;
-	args.whichfork = XFS_ATTR_FORK;
-	args.dp = context->dp;
-	args.name = name;
-	args.namelen = namelen;
-	args.hashval = xfs_da_hashname(args.name, args.namelen);
-	args.trans = context->tp;
-	args.value = xchk_xattr_valuebuf(sx->sc);
-	args.valuelen = valuelen;
+	args.value = ab->value;
 
 	error = xfs_attr_get_ilocked(&args);
 	/* ENODATA means the hash lookup failed and the attr is bad */
@@ -213,25 +277,23 @@ xchk_xattr_set_map(
 STATIC bool
 xchk_xattr_check_freemap(
 	struct xfs_scrub		*sc,
-	unsigned long			*map,
 	struct xfs_attr3_icleaf_hdr	*leafhdr)
 {
-	unsigned long			*freemap = xchk_xattr_freemap(sc);
-	unsigned long			*dstmap = xchk_xattr_dstmap(sc);
+	struct xchk_xattr_buf		*ab = sc->buf;
 	unsigned int			mapsize = sc->mp->m_attr_geo->blksize;
 	int				i;
 
 	/* Construct bitmap of freemap contents. */
-	bitmap_zero(freemap, mapsize);
+	bitmap_zero(ab->freemap, mapsize);
 	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
-		if (!xchk_xattr_set_map(sc, freemap,
+		if (!xchk_xattr_set_map(sc, ab->freemap,
 				leafhdr->freemap[i].base,
 				leafhdr->freemap[i].size))
 			return false;
 	}
 
 	/* Look for bits that are set in freemap and are marked in use. */
-	return bitmap_and(dstmap, freemap, map, mapsize) == 0;
+	return !bitmap_intersects(ab->freemap, ab->usedmap, mapsize);
 }
 
 /*
@@ -251,7 +313,7 @@ xchk_xattr_entry(
 	__u32				*last_hashval)
 {
 	struct xfs_mount		*mp = ds->state->mp;
-	unsigned long			*usedmap = xchk_xattr_usedmap(ds->sc);
+	struct xchk_xattr_buf		*ab = ds->sc->buf;
 	char				*name_end;
 	struct xfs_attr_leaf_name_local	*lentry;
 	struct xfs_attr_leaf_name_remote *rentry;
@@ -291,7 +353,7 @@ xchk_xattr_entry(
 	if (name_end > buf_end)
 		xchk_da_set_corrupt(ds, level);
 
-	if (!xchk_xattr_set_map(ds->sc, usedmap, nameidx, namesize))
+	if (!xchk_xattr_set_map(ds->sc, ab->usedmap, nameidx, namesize))
 		xchk_da_set_corrupt(ds, level);
 	if (!(ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
 		*usedbytes += namesize;
@@ -311,35 +373,26 @@ xchk_xattr_block(
 	struct xfs_attr_leafblock	*leaf = bp->b_addr;
 	struct xfs_attr_leaf_entry	*ent;
 	struct xfs_attr_leaf_entry	*entries;
-	unsigned long			*usedmap;
+	struct xchk_xattr_buf		*ab = ds->sc->buf;
 	char				*buf_end;
 	size_t				off;
 	__u32				last_hashval = 0;
 	unsigned int			usedbytes = 0;
 	unsigned int			hdrsize;
 	int				i;
-	int				error;
 
 	if (*last_checked == blk->blkno)
 		return 0;
 
-	/* Allocate memory for block usage checking. */
-	error = xchk_setup_xattr_buf(ds->sc, 0, XCHK_GFP_FLAGS);
-	if (error == -ENOMEM)
-		return -EDEADLOCK;
-	if (error)
-		return error;
-	usedmap = xchk_xattr_usedmap(ds->sc);
-
 	*last_checked = blk->blkno;
-	bitmap_zero(usedmap, mp->m_attr_geo->blksize);
+	bitmap_zero(ab->usedmap, mp->m_attr_geo->blksize);
 
 	/* Check all the padding. */
 	if (xfs_has_crc(ds->sc->mp)) {
-		struct xfs_attr3_leafblock	*leaf = bp->b_addr;
+		struct xfs_attr3_leafblock	*leaf3 = bp->b_addr;
 
-		if (leaf->hdr.pad1 != 0 || leaf->hdr.pad2 != 0 ||
-		    leaf->hdr.info.hdr.pad != 0)
+		if (leaf3->hdr.pad1 != 0 || leaf3->hdr.pad2 != 0 ||
+		    leaf3->hdr.info.hdr.pad != 0)
 			xchk_da_set_corrupt(ds, level);
 	} else {
 		if (leaf->hdr.pad1 != 0 || leaf->hdr.info.pad != 0)
@@ -356,7 +409,7 @@ xchk_xattr_block(
 		xchk_da_set_corrupt(ds, level);
 	if (leafhdr.firstused < hdrsize)
 		xchk_da_set_corrupt(ds, level);
-	if (!xchk_xattr_set_map(ds->sc, usedmap, 0, hdrsize))
+	if (!xchk_xattr_set_map(ds->sc, ab->usedmap, 0, hdrsize))
 		xchk_da_set_corrupt(ds, level);
 
 	if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
@@ -370,7 +423,7 @@ xchk_xattr_block(
 	for (i = 0, ent = entries; i < leafhdr.count; ent++, i++) {
 		/* Mark the leaf entry itself. */
 		off = (char *)ent - (char *)leaf;
-		if (!xchk_xattr_set_map(ds->sc, usedmap, off,
+		if (!xchk_xattr_set_map(ds->sc, ab->usedmap, off,
 				sizeof(xfs_attr_leaf_entry_t))) {
 			xchk_da_set_corrupt(ds, level);
 			goto out;
@@ -384,7 +437,7 @@ xchk_xattr_block(
 			goto out;
 	}
 
-	if (!xchk_xattr_check_freemap(ds->sc, usedmap, &leafhdr))
+	if (!xchk_xattr_check_freemap(ds->sc, &leafhdr))
 		xchk_da_set_corrupt(ds, level);
 
 	if (leafhdr.usedbytes != usedbytes)
@@ -468,38 +521,115 @@ out:
 	return error;
 }
 
+/* Check space usage of shortform attrs. */
+STATIC int
+xchk_xattr_check_sf(
+	struct xfs_scrub		*sc)
+{
+	struct xchk_xattr_buf		*ab = sc->buf;
+	struct xfs_attr_shortform	*sf;
+	struct xfs_attr_sf_entry	*sfe;
+	struct xfs_attr_sf_entry	*next;
+	struct xfs_ifork		*ifp;
+	unsigned char			*end;
+	int				i;
+	int				error = 0;
+
+	ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
+
+	bitmap_zero(ab->usedmap, ifp->if_bytes);
+	sf = (struct xfs_attr_shortform *)sc->ip->i_af.if_u1.if_data;
+	end = (unsigned char *)ifp->if_u1.if_data + ifp->if_bytes;
+	xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(sf->hdr));
+
+	sfe = &sf->list[0];
+	if ((unsigned char *)sfe > end) {
+		xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+		return 0;
+	}
+
+	for (i = 0; i < sf->hdr.count; i++) {
+		unsigned char		*name = sfe->nameval;
+		unsigned char		*value = &sfe->nameval[sfe->namelen];
+
+		if (xchk_should_terminate(sc, &error))
+			return error;
+
+		next = xfs_attr_sf_nextentry(sfe);
+		if ((unsigned char *)next > end) {
+			xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+			break;
+		}
+
+		if (!xchk_xattr_set_map(sc, ab->usedmap,
+				(char *)sfe - (char *)sf,
+				sizeof(struct xfs_attr_sf_entry))) {
+			xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+			break;
+		}
+
+		if (!xchk_xattr_set_map(sc, ab->usedmap,
+				(char *)name - (char *)sf,
+				sfe->namelen)) {
+			xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+			break;
+		}
+
+		if (!xchk_xattr_set_map(sc, ab->usedmap,
+				(char *)value - (char *)sf,
+				sfe->valuelen)) {
+			xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+			break;
+		}
+
+		sfe = next;
+	}
+
+	return 0;
+}
+
 /* Scrub the extended attribute metadata. */
 int
 xchk_xattr(
 	struct xfs_scrub		*sc)
 {
-	struct xchk_xattr		sx;
+	struct xchk_xattr		sx = {
+		.sc			= sc,
+		.context		= {
+			.dp		= sc->ip,
+			.tp		= sc->tp,
+			.resynch	= 1,
+			.put_listent	= xchk_xattr_listent,
+			.allow_incomplete = true,
+		},
+	};
 	xfs_dablk_t			last_checked = -1U;
 	int				error = 0;
 
 	if (!xfs_inode_hasattr(sc->ip))
 		return -ENOENT;
 
-	memset(&sx, 0, sizeof(sx));
-	/* Check attribute tree structure */
-	error = xchk_da_btree(sc, XFS_ATTR_FORK, xchk_xattr_rec,
-			&last_checked);
+	/* Allocate memory for xattr checking. */
+	error = xchk_setup_xattr_buf(sc, 0);
+	if (error == -ENOMEM)
+		return -EDEADLOCK;
 	if (error)
-		goto out;
+		return error;
 
-	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
-		goto out;
+	/* Check the physical structure of the xattr. */
+	if (sc->ip->i_af.if_format == XFS_DINODE_FMT_LOCAL)
+		error = xchk_xattr_check_sf(sc);
+	else
+		error = xchk_da_btree(sc, XFS_ATTR_FORK, xchk_xattr_rec,
+				&last_checked);
+	if (error)
+		return error;
 
-	/* Check that every attr key can also be looked up by hash. */
-	sx.context.dp = sc->ip;
-	sx.context.resynch = 1;
-	sx.context.put_listent = xchk_xattr_listent;
-	sx.context.tp = sc->tp;
-	sx.context.allow_incomplete = true;
-	sx.sc = sc;
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return 0;
 
 	/*
-	 * Look up every xattr in this file by name.
+	 * Look up every xattr in this file by name and hash.
 	 *
 	 * Use the backend implementation of xfs_attr_list to call
 	 * xchk_xattr_listent on every attribute key in this inode.
@@ -516,11 +646,11 @@ xchk_xattr(
 	 */
 	error = xfs_attr_list_ilocked(&sx.context);
 	if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error))
-		goto out;
+		return error;
 
 	/* Did our listent function try to return any errors? */
 	if (sx.context.seen_enough < 0)
-		error = sx.context.seen_enough;
-out:
-	return error;
+		return sx.context.seen_enough;
+
+	return 0;
 }
diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h
index 3590e10e3e62..48fd9402c432 100644
--- a/fs/xfs/scrub/attr.h
+++ b/fs/xfs/scrub/attr.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
- * Copyright (C) 2019 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2019-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #ifndef __XFS_SCRUB_ATTR_H__
 #define __XFS_SCRUB_ATTR_H__
@@ -10,59 +10,15 @@
  * Temporary storage for online scrub and repair of extended attributes.
  */
 struct xchk_xattr_buf {
-	/* Size of @buf, in bytes. */
-	size_t			sz;
+	/* Bitmap of used space in xattr leaf blocks and shortform forks. */
+	unsigned long		*usedmap;
 
-	/*
-	 * Memory buffer -- either used for extracting attr values while
-	 * walking the attributes; or for computing attr block bitmaps when
-	 * checking the attribute tree.
-	 *
-	 * Each bitmap contains enough bits to track every byte in an attr
-	 * block (rounded up to the size of an unsigned long).  The attr block
-	 * used space bitmap starts at the beginning of the buffer; the free
-	 * space bitmap follows immediately after; and we have a third buffer
-	 * for storing intermediate bitmap results.
-	 */
-	uint8_t			buf[];
-};
-
-/* A place to store attribute values. */
-static inline uint8_t *
-xchk_xattr_valuebuf(
-	struct xfs_scrub	*sc)
-{
-	struct xchk_xattr_buf	*ab = sc->buf;
-
-	return ab->buf;
-}
-
-/* A bitmap of space usage computed by walking an attr leaf block. */
-static inline unsigned long *
-xchk_xattr_usedmap(
-	struct xfs_scrub	*sc)
-{
-	struct xchk_xattr_buf	*ab = sc->buf;
+	/* Bitmap of free space in xattr leaf blocks. */
+	unsigned long		*freemap;
 
-	return (unsigned long *)ab->buf;
-}
-
-/* A bitmap of free space computed by walking attr leaf block free info. */
-static inline unsigned long *
-xchk_xattr_freemap(
-	struct xfs_scrub	*sc)
-{
-	return xchk_xattr_usedmap(sc) +
-			BITS_TO_LONGS(sc->mp->m_attr_geo->blksize);
-}
-
-/* A bitmap used to hold temporary results. */
-static inline unsigned long *
-xchk_xattr_dstmap(
-	struct xfs_scrub	*sc)
-{
-	return xchk_xattr_freemap(sc) +
-			BITS_TO_LONGS(sc->mp->m_attr_geo->blksize);
-}
+	/* Memory buffer used to extract xattr values. */
+	void			*value;
+	size_t			value_sz;
+};
 
 #endif	/* __XFS_SCRUB_ATTR_H__ */
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index a255f09e9f0a..0c959be396ea 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -1,11 +1,12 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2018 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
+#include "xfs_bit.h"
 #include "xfs_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
@@ -13,27 +14,160 @@
 #include "scrub/scrub.h"
 #include "scrub/bitmap.h"
 
+#include <linux/interval_tree_generic.h>
+
+struct xbitmap_node {
+	struct rb_node	bn_rbnode;
+
+	/* First set bit of this interval and subtree. */
+	uint64_t	bn_start;
+
+	/* Last set bit of this interval. */
+	uint64_t	bn_last;
+
+	/* Last set bit of this subtree.  Do not touch this. */
+	uint64_t	__bn_subtree_last;
+};
+
+/* Define our own interval tree type with uint64_t parameters. */
+
+#define START(node) ((node)->bn_start)
+#define LAST(node)  ((node)->bn_last)
+
 /*
- * Set a range of this bitmap.  Caller must ensure the range is not set.
- *
- * This is the logical equivalent of bitmap |= mask(start, len).
+ * These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll
+ * forward-declare them anyway for clarity.
  */
+static inline void
+xbitmap_tree_insert(struct xbitmap_node *node, struct rb_root_cached *root);
+
+static inline void
+xbitmap_tree_remove(struct xbitmap_node *node, struct rb_root_cached *root);
+
+static inline struct xbitmap_node *
+xbitmap_tree_iter_first(struct rb_root_cached *root, uint64_t start,
+			uint64_t last);
+
+static inline struct xbitmap_node *
+xbitmap_tree_iter_next(struct xbitmap_node *node, uint64_t start,
+		       uint64_t last);
+
+INTERVAL_TREE_DEFINE(struct xbitmap_node, bn_rbnode, uint64_t,
+		__bn_subtree_last, START, LAST, static inline, xbitmap_tree)
+
+/* Iterate each interval of a bitmap.  Do not change the bitmap. */
+#define for_each_xbitmap_extent(bn, bitmap) \
+	for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \
+				   struct xbitmap_node, bn_rbnode); \
+	     (bn) != NULL; \
+	     (bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \
+				   struct xbitmap_node, bn_rbnode))
+
+/* Clear a range of this bitmap. */
+int
+xbitmap_clear(
+	struct xbitmap		*bitmap,
+	uint64_t		start,
+	uint64_t		len)
+{
+	struct xbitmap_node	*bn;
+	struct xbitmap_node	*new_bn;
+	uint64_t		last = start + len - 1;
+
+	while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last))) {
+		if (bn->bn_start < start && bn->bn_last > last) {
+			uint64_t	old_last = bn->bn_last;
+
+			/* overlaps with the entire clearing range */
+			xbitmap_tree_remove(bn, &bitmap->xb_root);
+			bn->bn_last = start - 1;
+			xbitmap_tree_insert(bn, &bitmap->xb_root);
+
+			/* add an extent */
+			new_bn = kmalloc(sizeof(struct xbitmap_node),
+					XCHK_GFP_FLAGS);
+			if (!new_bn)
+				return -ENOMEM;
+			new_bn->bn_start = last + 1;
+			new_bn->bn_last = old_last;
+			xbitmap_tree_insert(new_bn, &bitmap->xb_root);
+		} else if (bn->bn_start < start) {
+			/* overlaps with the left side of the clearing range */
+			xbitmap_tree_remove(bn, &bitmap->xb_root);
+			bn->bn_last = start - 1;
+			xbitmap_tree_insert(bn, &bitmap->xb_root);
+		} else if (bn->bn_last > last) {
+			/* overlaps with the right side of the clearing range */
+			xbitmap_tree_remove(bn, &bitmap->xb_root);
+			bn->bn_start = last + 1;
+			xbitmap_tree_insert(bn, &bitmap->xb_root);
+			break;
+		} else {
+			/* in the middle of the clearing range */
+			xbitmap_tree_remove(bn, &bitmap->xb_root);
+			kfree(bn);
+		}
+	}
+
+	return 0;
+}
+
+/* Set a range of this bitmap. */
 int
 xbitmap_set(
 	struct xbitmap		*bitmap,
 	uint64_t		start,
 	uint64_t		len)
 {
-	struct xbitmap_range	*bmr;
+	struct xbitmap_node	*left;
+	struct xbitmap_node	*right;
+	uint64_t		last = start + len - 1;
+	int			error;
 
-	bmr = kmalloc(sizeof(struct xbitmap_range), XCHK_GFP_FLAGS);
-	if (!bmr)
-		return -ENOMEM;
+	/* Is this whole range already set? */
+	left = xbitmap_tree_iter_first(&bitmap->xb_root, start, last);
+	if (left && left->bn_start <= start && left->bn_last >= last)
+		return 0;
+
+	/* Clear out everything in the range we want to set. */
+	error = xbitmap_clear(bitmap, start, len);
+	if (error)
+		return error;
+
+	/* Do we have a left-adjacent extent? */
+	left = xbitmap_tree_iter_first(&bitmap->xb_root, start - 1, start - 1);
+	ASSERT(!left || left->bn_last + 1 == start);
+
+	/* Do we have a right-adjacent extent? */
+	right = xbitmap_tree_iter_first(&bitmap->xb_root, last + 1, last + 1);
+	ASSERT(!right || right->bn_start == last + 1);
 
-	INIT_LIST_HEAD(&bmr->list);
-	bmr->start = start;
-	bmr->len = len;
-	list_add_tail(&bmr->list, &bitmap->list);
+	if (left && right) {
+		/* combine left and right adjacent extent */
+		xbitmap_tree_remove(left, &bitmap->xb_root);
+		xbitmap_tree_remove(right, &bitmap->xb_root);
+		left->bn_last = right->bn_last;
+		xbitmap_tree_insert(left, &bitmap->xb_root);
+		kfree(right);
+	} else if (left) {
+		/* combine with left extent */
+		xbitmap_tree_remove(left, &bitmap->xb_root);
+		left->bn_last = last;
+		xbitmap_tree_insert(left, &bitmap->xb_root);
+	} else if (right) {
+		/* combine with right extent */
+		xbitmap_tree_remove(right, &bitmap->xb_root);
+		right->bn_start = start;
+		xbitmap_tree_insert(right, &bitmap->xb_root);
+	} else {
+		/* add an extent */
+		left = kmalloc(sizeof(struct xbitmap_node), XCHK_GFP_FLAGS);
+		if (!left)
+			return -ENOMEM;
+		left->bn_start = start;
+		left->bn_last = last;
+		xbitmap_tree_insert(left, &bitmap->xb_root);
+	}
 
 	return 0;
 }
@@ -43,12 +177,11 @@ void
 xbitmap_destroy(
 	struct xbitmap		*bitmap)
 {
-	struct xbitmap_range	*bmr;
-	struct xbitmap_range	*n;
+	struct xbitmap_node	*bn;
 
-	for_each_xbitmap_extent(bmr, n, bitmap) {
-		list_del(&bmr->list);
-		kfree(bmr);
+	while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) {
+		xbitmap_tree_remove(bn, &bitmap->xb_root);
+		kfree(bn);
 	}
 }
 
@@ -57,27 +190,7 @@ void
 xbitmap_init(
 	struct xbitmap		*bitmap)
 {
-	INIT_LIST_HEAD(&bitmap->list);
-}
-
-/* Compare two btree extents. */
-static int
-xbitmap_range_cmp(
-	void			*priv,
-	const struct list_head	*a,
-	const struct list_head	*b)
-{
-	struct xbitmap_range	*ap;
-	struct xbitmap_range	*bp;
-
-	ap = container_of(a, struct xbitmap_range, list);
-	bp = container_of(b, struct xbitmap_range, list);
-
-	if (ap->start > bp->start)
-		return 1;
-	if (ap->start < bp->start)
-		return -1;
-	return 0;
+	bitmap->xb_root = RB_ROOT_CACHED;
 }
 
 /*
@@ -94,118 +207,26 @@ xbitmap_range_cmp(
  *
  * This is the logical equivalent of bitmap &= ~sub.
  */
-#define LEFT_ALIGNED	(1 << 0)
-#define RIGHT_ALIGNED	(1 << 1)
 int
 xbitmap_disunion(
 	struct xbitmap		*bitmap,
 	struct xbitmap		*sub)
 {
-	struct list_head	*lp;
-	struct xbitmap_range	*br;
-	struct xbitmap_range	*new_br;
-	struct xbitmap_range	*sub_br;
-	uint64_t		sub_start;
-	uint64_t		sub_len;
-	int			state;
-	int			error = 0;
+	struct xbitmap_node	*bn;
+	int			error;
 
-	if (list_empty(&bitmap->list) || list_empty(&sub->list))
+	if (xbitmap_empty(bitmap) || xbitmap_empty(sub))
 		return 0;
-	ASSERT(!list_empty(&sub->list));
-
-	list_sort(NULL, &bitmap->list, xbitmap_range_cmp);
-	list_sort(NULL, &sub->list, xbitmap_range_cmp);
-
-	/*
-	 * Now that we've sorted both lists, we iterate bitmap once, rolling
-	 * forward through sub and/or bitmap as necessary until we find an
-	 * overlap or reach the end of either list.  We do not reset lp to the
-	 * head of bitmap nor do we reset sub_br to the head of sub.  The
-	 * list traversal is similar to merge sort, but we're deleting
-	 * instead.  In this manner we avoid O(n^2) operations.
-	 */
-	sub_br = list_first_entry(&sub->list, struct xbitmap_range,
-			list);
-	lp = bitmap->list.next;
-	while (lp != &bitmap->list) {
-		br = list_entry(lp, struct xbitmap_range, list);
-
-		/*
-		 * Advance sub_br and/or br until we find a pair that
-		 * intersect or we run out of extents.
-		 */
-		while (sub_br->start + sub_br->len <= br->start) {
-			if (list_is_last(&sub_br->list, &sub->list))
-				goto out;
-			sub_br = list_next_entry(sub_br, list);
-		}
-		if (sub_br->start >= br->start + br->len) {
-			lp = lp->next;
-			continue;
-		}
 
-		/* trim sub_br to fit the extent we have */
-		sub_start = sub_br->start;
-		sub_len = sub_br->len;
-		if (sub_br->start < br->start) {
-			sub_len -= br->start - sub_br->start;
-			sub_start = br->start;
-		}
-		if (sub_len > br->len)
-			sub_len = br->len;
-
-		state = 0;
-		if (sub_start == br->start)
-			state |= LEFT_ALIGNED;
-		if (sub_start + sub_len == br->start + br->len)
-			state |= RIGHT_ALIGNED;
-		switch (state) {
-		case LEFT_ALIGNED:
-			/* Coincides with only the left. */
-			br->start += sub_len;
-			br->len -= sub_len;
-			break;
-		case RIGHT_ALIGNED:
-			/* Coincides with only the right. */
-			br->len -= sub_len;
-			lp = lp->next;
-			break;
-		case LEFT_ALIGNED | RIGHT_ALIGNED:
-			/* Total overlap, just delete ex. */
-			lp = lp->next;
-			list_del(&br->list);
-			kfree(br);
-			break;
-		case 0:
-			/*
-			 * Deleting from the middle: add the new right extent
-			 * and then shrink the left extent.
-			 */
-			new_br = kmalloc(sizeof(struct xbitmap_range),
-					XCHK_GFP_FLAGS);
-			if (!new_br) {
-				error = -ENOMEM;
-				goto out;
-			}
-			INIT_LIST_HEAD(&new_br->list);
-			new_br->start = sub_start + sub_len;
-			new_br->len = br->start + br->len - new_br->start;
-			list_add(&new_br->list, &br->list);
-			br->len = sub_start - br->start;
-			lp = lp->next;
-			break;
-		default:
-			ASSERT(0);
-			break;
-		}
+	for_each_xbitmap_extent(bn, sub) {
+		error = xbitmap_clear(bitmap, bn->bn_start,
+				bn->bn_last - bn->bn_start + 1);
+		if (error)
+			return error;
 	}
 
-out:
-	return error;
+	return 0;
 }
-#undef LEFT_ALIGNED
-#undef RIGHT_ALIGNED
 
 /*
  * Record all btree blocks seen while iterating all records of a btree.
@@ -242,6 +263,38 @@ out:
  * For the 300th record we just exit, with the list being [1, 4, 2, 3].
  */
 
+/* Mark a btree block to the agblock bitmap. */
+STATIC int
+xagb_bitmap_visit_btblock(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	void			*priv)
+{
+	struct xagb_bitmap	*bitmap = priv;
+	struct xfs_buf		*bp;
+	xfs_fsblock_t		fsbno;
+	xfs_agblock_t		agbno;
+
+	xfs_btree_get_block(cur, level, &bp);
+	if (!bp)
+		return 0;
+
+	fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
+	agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+
+	return xagb_bitmap_set(bitmap, agbno, 1);
+}
+
+/* Mark all (per-AG) btree blocks in the agblock bitmap. */
+int
+xagb_bitmap_set_btblocks(
+	struct xagb_bitmap	*bitmap,
+	struct xfs_btree_cur	*cur)
+{
+	return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock,
+			XFS_BTREE_VISIT_ALL, bitmap);
+}
+
 /*
  * Record all the buffers pointed to by the btree cursor.  Callers already
  * engaged in a btree walk should call this function to capture the list of
@@ -304,12 +357,97 @@ uint64_t
 xbitmap_hweight(
 	struct xbitmap		*bitmap)
 {
-	struct xbitmap_range	*bmr;
-	struct xbitmap_range	*n;
+	struct xbitmap_node	*bn;
 	uint64_t		ret = 0;
 
-	for_each_xbitmap_extent(bmr, n, bitmap)
-		ret += bmr->len;
+	for_each_xbitmap_extent(bn, bitmap)
+		ret += bn->bn_last - bn->bn_start + 1;
 
 	return ret;
 }
+
+/* Call a function for every run of set bits in this bitmap. */
+int
+xbitmap_walk(
+	struct xbitmap		*bitmap,
+	xbitmap_walk_fn		fn,
+	void			*priv)
+{
+	struct xbitmap_node	*bn;
+	int			error = 0;
+
+	for_each_xbitmap_extent(bn, bitmap) {
+		error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv);
+		if (error)
+			break;
+	}
+
+	return error;
+}
+
+struct xbitmap_walk_bits {
+	xbitmap_walk_bits_fn	fn;
+	void			*priv;
+};
+
+/* Walk all the bits in a run. */
+static int
+xbitmap_walk_bits_in_run(
+	uint64_t			start,
+	uint64_t			len,
+	void				*priv)
+{
+	struct xbitmap_walk_bits	*wb = priv;
+	uint64_t			i;
+	int				error = 0;
+
+	for (i = start; i < start + len; i++) {
+		error = wb->fn(i, wb->priv);
+		if (error)
+			break;
+	}
+
+	return error;
+}
+
+/* Call a function for every set bit in this bitmap. */
+int
+xbitmap_walk_bits(
+	struct xbitmap			*bitmap,
+	xbitmap_walk_bits_fn		fn,
+	void				*priv)
+{
+	struct xbitmap_walk_bits	wb = {.fn = fn, .priv = priv};
+
+	return xbitmap_walk(bitmap, xbitmap_walk_bits_in_run, &wb);
+}
+
+/* Does this bitmap have no bits set at all? */
+bool
+xbitmap_empty(
+	struct xbitmap		*bitmap)
+{
+	return bitmap->xb_root.rb_root.rb_node == NULL;
+}
+
+/* Is the start of the range set or clear?  And for how long? */
+bool
+xbitmap_test(
+	struct xbitmap		*bitmap,
+	uint64_t		start,
+	uint64_t		*len)
+{
+	struct xbitmap_node	*bn;
+	uint64_t		last = start + *len - 1;
+
+	bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last);
+	if (!bn)
+		return false;
+	if (bn->bn_start <= start) {
+		if (bn->bn_last < last)
+			*len = bn->bn_last - start + 1;
+		return true;
+	}
+	*len = bn->bn_start - start;
+	return false;
+}
diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h
index 900646b72de1..84981724ecaf 100644
--- a/fs/xfs/scrub/bitmap.h
+++ b/fs/xfs/scrub/bitmap.h
@@ -1,31 +1,19 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2018 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #ifndef __XFS_SCRUB_BITMAP_H__
 #define __XFS_SCRUB_BITMAP_H__
 
-struct xbitmap_range {
-	struct list_head	list;
-	uint64_t		start;
-	uint64_t		len;
-};
-
 struct xbitmap {
-	struct list_head	list;
+	struct rb_root_cached	xb_root;
 };
 
 void xbitmap_init(struct xbitmap *bitmap);
 void xbitmap_destroy(struct xbitmap *bitmap);
 
-#define for_each_xbitmap_extent(bex, n, bitmap) \
-	list_for_each_entry_safe((bex), (n), &(bitmap)->list, list)
-
-#define for_each_xbitmap_block(b, bex, n, bitmap) \
-	list_for_each_entry_safe((bex), (n), &(bitmap)->list, list) \
-		for ((b) = (bex)->start; (b) < (bex)->start + (bex)->len; (b)++)
-
+int xbitmap_clear(struct xbitmap *bitmap, uint64_t start, uint64_t len);
 int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len);
 int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub);
 int xbitmap_set_btcur_path(struct xbitmap *bitmap,
@@ -34,4 +22,93 @@ int xbitmap_set_btblocks(struct xbitmap *bitmap,
 		struct xfs_btree_cur *cur);
 uint64_t xbitmap_hweight(struct xbitmap *bitmap);
 
+/*
+ * Return codes for the bitmap iterator functions are 0 to continue iterating,
+ * and non-zero to stop iterating.  Any non-zero value will be passed up to the
+ * iteration caller.  The special value -ECANCELED can be used to stop
+ * iteration, because neither bitmap iterator ever generates that error code on
+ * its own.  Callers must not modify the bitmap while walking it.
+ */
+typedef int (*xbitmap_walk_fn)(uint64_t start, uint64_t len, void *priv);
+int xbitmap_walk(struct xbitmap *bitmap, xbitmap_walk_fn fn,
+		void *priv);
+
+typedef int (*xbitmap_walk_bits_fn)(uint64_t bit, void *priv);
+int xbitmap_walk_bits(struct xbitmap *bitmap, xbitmap_walk_bits_fn fn,
+		void *priv);
+
+bool xbitmap_empty(struct xbitmap *bitmap);
+bool xbitmap_test(struct xbitmap *bitmap, uint64_t start, uint64_t *len);
+
+/* Bitmaps, but for type-checked for xfs_agblock_t */
+
+struct xagb_bitmap {
+	struct xbitmap	agbitmap;
+};
+
+static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap)
+{
+	xbitmap_init(&bitmap->agbitmap);
+}
+
+static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap)
+{
+	xbitmap_destroy(&bitmap->agbitmap);
+}
+
+static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap,
+		xfs_agblock_t start, xfs_extlen_t len)
+{
+	return xbitmap_clear(&bitmap->agbitmap, start, len);
+}
+static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap,
+		xfs_agblock_t start, xfs_extlen_t len)
+{
+	return xbitmap_set(&bitmap->agbitmap, start, len);
+}
+
+static inline bool
+xagb_bitmap_test(
+	struct xagb_bitmap	*bitmap,
+	xfs_agblock_t		start,
+	xfs_extlen_t		*len)
+{
+	uint64_t		biglen = *len;
+	bool			ret;
+
+	ret = xbitmap_test(&bitmap->agbitmap, start, &biglen);
+
+	if (start + biglen >= UINT_MAX) {
+		ASSERT(0);
+		biglen = UINT_MAX - start;
+	}
+
+	*len = biglen;
+	return ret;
+}
+
+static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap,
+		struct xagb_bitmap *sub)
+{
+	return xbitmap_disunion(&bitmap->agbitmap, &sub->agbitmap);
+}
+
+static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap)
+{
+	return xbitmap_hweight(&bitmap->agbitmap);
+}
+static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap)
+{
+	return xbitmap_empty(&bitmap->agbitmap);
+}
+
+static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap,
+		xbitmap_walk_fn fn, void *priv)
+{
+	return xbitmap_walk(&bitmap->agbitmap, fn, priv);
+}
+
+int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap,
+		struct xfs_btree_cur *cur);
+
 #endif	/* __XFS_SCRUB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index dbbc7037074c..87ab9f95a487 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2017 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_fs.h"
@@ -31,12 +31,15 @@ xchk_setup_inode_bmap(
 {
 	int			error;
 
-	error = xchk_get_inode(sc);
+	if (xchk_need_intent_drain(sc))
+		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
+	error = xchk_iget_for_scrubbing(sc);
 	if (error)
 		goto out;
 
-	sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
-	xfs_ilock(sc->ip, sc->ilock_flags);
+	sc->ilock_flags = XFS_IOLOCK_EXCL;
+	xfs_ilock(sc->ip, XFS_IOLOCK_EXCL);
 
 	/*
 	 * We don't want any ephemeral data fork updates sitting around
@@ -47,6 +50,9 @@ xchk_setup_inode_bmap(
 	    sc->sm->sm_type == XFS_SCRUB_TYPE_BMBTD) {
 		struct address_space	*mapping = VFS_I(sc->ip)->i_mapping;
 
+		sc->ilock_flags |= XFS_MMAPLOCK_EXCL;
+		xfs_ilock(sc->ip, XFS_MMAPLOCK_EXCL);
+
 		inode_dio_wait(VFS_I(sc->ip));
 
 		/*
@@ -90,11 +96,23 @@ out:
 
 struct xchk_bmap_info {
 	struct xfs_scrub	*sc;
+
+	/* Incore extent tree cursor */
 	struct xfs_iext_cursor	icur;
-	xfs_fileoff_t		lastoff;
+
+	/* Previous fork mapping that we examined */
+	struct xfs_bmbt_irec	prev_rec;
+
+	/* Is this a realtime fork? */
 	bool			is_rt;
+
+	/* May mappings point to shared space? */
 	bool			is_shared;
+
+	/* Was the incore extent tree loaded? */
 	bool			was_loaded;
+
+	/* Which inode fork are we checking? */
 	int			whichfork;
 };
 
@@ -147,49 +165,7 @@ xchk_bmap_get_rmap(
 	return has_rmap;
 }
 
-static inline bool
-xchk_bmap_has_prev(
-	struct xchk_bmap_info	*info,
-	struct xfs_bmbt_irec	*irec)
-{
-	struct xfs_bmbt_irec	got;
-	struct xfs_ifork	*ifp;
-
-	ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork);
-
-	if (!xfs_iext_peek_prev_extent(ifp, &info->icur, &got))
-		return false;
-	if (got.br_startoff + got.br_blockcount != irec->br_startoff)
-		return false;
-	if (got.br_startblock + got.br_blockcount != irec->br_startblock)
-		return false;
-	if (got.br_state != irec->br_state)
-		return false;
-	return true;
-}
-
-static inline bool
-xchk_bmap_has_next(
-	struct xchk_bmap_info	*info,
-	struct xfs_bmbt_irec	*irec)
-{
-	struct xfs_bmbt_irec	got;
-	struct xfs_ifork	*ifp;
-
-	ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork);
-
-	if (!xfs_iext_peek_next_extent(ifp, &info->icur, &got))
-		return false;
-	if (irec->br_startoff + irec->br_blockcount != got.br_startoff)
-		return false;
-	if (irec->br_startblock + irec->br_blockcount != got.br_startblock)
-		return false;
-	if (got.br_state != irec->br_state)
-		return false;
-	return true;
-}
-
-/* Make sure that we have rmapbt records for this extent. */
+/* Make sure that we have rmapbt records for this data/attr fork extent. */
 STATIC void
 xchk_bmap_xref_rmap(
 	struct xchk_bmap_info	*info,
@@ -198,41 +174,39 @@ xchk_bmap_xref_rmap(
 {
 	struct xfs_rmap_irec	rmap;
 	unsigned long long	rmap_end;
-	uint64_t		owner;
+	uint64_t		owner = info->sc->ip->i_ino;
 
 	if (!info->sc->sa.rmap_cur || xchk_skip_xref(info->sc->sm))
 		return;
 
-	if (info->whichfork == XFS_COW_FORK)
-		owner = XFS_RMAP_OWN_COW;
-	else
-		owner = info->sc->ip->i_ino;
-
 	/* Find the rmap record for this irec. */
 	if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap))
 		return;
 
-	/* Check the rmap. */
+	/*
+	 * The rmap must be an exact match for this incore file mapping record,
+	 * which may have arisen from multiple ondisk records.
+	 */
+	if (rmap.rm_startblock != agbno)
+		xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+				irec->br_startoff);
+
 	rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount;
-	if (rmap.rm_startblock > agbno ||
-	    agbno + irec->br_blockcount > rmap_end)
+	if (rmap_end != agbno + irec->br_blockcount)
 		xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
 				irec->br_startoff);
 
-	/*
-	 * Check the logical offsets if applicable.  CoW staging extents
-	 * don't track logical offsets since the mappings only exist in
-	 * memory.
-	 */
-	if (info->whichfork != XFS_COW_FORK) {
-		rmap_end = (unsigned long long)rmap.rm_offset +
-				rmap.rm_blockcount;
-		if (rmap.rm_offset > irec->br_startoff ||
-		    irec->br_startoff + irec->br_blockcount > rmap_end)
-			xchk_fblock_xref_set_corrupt(info->sc,
-					info->whichfork, irec->br_startoff);
-	}
+	/* Check the logical offsets. */
+	if (rmap.rm_offset != irec->br_startoff)
+		xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+				irec->br_startoff);
 
+	rmap_end = (unsigned long long)rmap.rm_offset + rmap.rm_blockcount;
+	if (rmap_end != irec->br_startoff + irec->br_blockcount)
+		xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+				irec->br_startoff);
+
+	/* Check the owner */
 	if (rmap.rm_owner != owner)
 		xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
 				irec->br_startoff);
@@ -244,8 +218,7 @@ xchk_bmap_xref_rmap(
 	 * records because the blocks are owned (on-disk) by the refcountbt,
 	 * which doesn't track unwritten state.
 	 */
-	if (owner != XFS_RMAP_OWN_COW &&
-	    !!(irec->br_state == XFS_EXT_UNWRITTEN) !=
+	if (!!(irec->br_state == XFS_EXT_UNWRITTEN) !=
 	    !!(rmap.rm_flags & XFS_RMAP_UNWRITTEN))
 		xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
 				irec->br_startoff);
@@ -257,34 +230,60 @@ xchk_bmap_xref_rmap(
 	if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK)
 		xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
 				irec->br_startoff);
+}
+
+/* Make sure that we have rmapbt records for this COW fork extent. */
+STATIC void
+xchk_bmap_xref_rmap_cow(
+	struct xchk_bmap_info	*info,
+	struct xfs_bmbt_irec	*irec,
+	xfs_agblock_t		agbno)
+{
+	struct xfs_rmap_irec	rmap;
+	unsigned long long	rmap_end;
+	uint64_t		owner = XFS_RMAP_OWN_COW;
+
+	if (!info->sc->sa.rmap_cur || xchk_skip_xref(info->sc->sm))
+		return;
+
+	/* Find the rmap record for this irec. */
+	if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap))
+		return;
 
 	/*
-	 * If the rmap starts before this bmbt record, make sure there's a bmbt
-	 * record for the previous offset that is contiguous with this mapping.
-	 * Skip this for CoW fork extents because the refcount btree (and not
-	 * the inode) is the ondisk owner for those extents.
+	 * CoW staging extents are owned by the refcount btree, so the rmap
+	 * can start before and end after the physical space allocated to this
+	 * mapping.  There are no offsets to check.
 	 */
-	if (info->whichfork != XFS_COW_FORK && rmap.rm_startblock < agbno &&
-	    !xchk_bmap_has_prev(info, irec)) {
+	if (rmap.rm_startblock > agbno)
+		xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+				irec->br_startoff);
+
+	rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount;
+	if (rmap_end < agbno + irec->br_blockcount)
+		xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+				irec->br_startoff);
+
+	/* Check the owner */
+	if (rmap.rm_owner != owner)
 		xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
 				irec->br_startoff);
-		return;
-	}
 
 	/*
-	 * If the rmap ends after this bmbt record, make sure there's a bmbt
-	 * record for the next offset that is contiguous with this mapping.
-	 * Skip this for CoW fork extents because the refcount btree (and not
-	 * the inode) is the ondisk owner for those extents.
+	 * No flags allowed.  Note that the (in-memory) CoW fork distinguishes
+	 * between unwritten and written extents, but we don't track that in
+	 * the rmap records because the blocks are owned (on-disk) by the
+	 * refcountbt, which doesn't track unwritten state.
 	 */
-	rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount;
-	if (info->whichfork != XFS_COW_FORK &&
-	    rmap_end > agbno + irec->br_blockcount &&
-	    !xchk_bmap_has_next(info, irec)) {
+	if (rmap.rm_flags & XFS_RMAP_ATTR_FORK)
+		xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+				irec->br_startoff);
+	if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK)
+		xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+				irec->br_startoff);
+	if (rmap.rm_flags & XFS_RMAP_UNWRITTEN)
 		xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
 				irec->br_startoff);
-		return;
-	}
 }
 
 /* Cross-reference a single rtdev extent record. */
@@ -305,6 +304,7 @@ xchk_bmap_iextent_xref(
 	struct xchk_bmap_info	*info,
 	struct xfs_bmbt_irec	*irec)
 {
+	struct xfs_owner_info	oinfo;
 	struct xfs_mount	*mp = info->sc->mp;
 	xfs_agnumber_t		agno;
 	xfs_agblock_t		agbno;
@@ -322,17 +322,35 @@ xchk_bmap_iextent_xref(
 
 	xchk_xref_is_used_space(info->sc, agbno, len);
 	xchk_xref_is_not_inode_chunk(info->sc, agbno, len);
-	xchk_bmap_xref_rmap(info, irec, agbno);
 	switch (info->whichfork) {
 	case XFS_DATA_FORK:
-		if (xfs_is_reflink_inode(info->sc->ip))
-			break;
-		fallthrough;
+		xchk_bmap_xref_rmap(info, irec, agbno);
+		if (!xfs_is_reflink_inode(info->sc->ip)) {
+			xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino,
+					info->whichfork, irec->br_startoff);
+			xchk_xref_is_only_owned_by(info->sc, agbno,
+					irec->br_blockcount, &oinfo);
+			xchk_xref_is_not_shared(info->sc, agbno,
+					irec->br_blockcount);
+		}
+		xchk_xref_is_not_cow_staging(info->sc, agbno,
+				irec->br_blockcount);
+		break;
 	case XFS_ATTR_FORK:
+		xchk_bmap_xref_rmap(info, irec, agbno);
+		xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino,
+				info->whichfork, irec->br_startoff);
+		xchk_xref_is_only_owned_by(info->sc, agbno, irec->br_blockcount,
+				&oinfo);
 		xchk_xref_is_not_shared(info->sc, agbno,
 				irec->br_blockcount);
+		xchk_xref_is_not_cow_staging(info->sc, agbno,
+				irec->br_blockcount);
 		break;
 	case XFS_COW_FORK:
+		xchk_bmap_xref_rmap_cow(info, irec, agbno);
+		xchk_xref_is_only_owned_by(info->sc, agbno, irec->br_blockcount,
+				&XFS_RMAP_OINFO_COW);
 		xchk_xref_is_cow_staging(info->sc, agbno,
 				irec->br_blockcount);
 		xchk_xref_is_not_shared(info->sc, agbno,
@@ -382,7 +400,8 @@ xchk_bmap_iextent(
 	 * Check for out-of-order extents.  This record could have come
 	 * from the incore list, for which there is no ordering check.
 	 */
-	if (irec->br_startoff < info->lastoff)
+	if (irec->br_startoff < info->prev_rec.br_startoff +
+				info->prev_rec.br_blockcount)
 		xchk_fblock_set_corrupt(info->sc, info->whichfork,
 				irec->br_startoff);
 
@@ -392,15 +411,7 @@ xchk_bmap_iextent(
 
 	xchk_bmap_dirattr_extent(ip, info, irec);
 
-	/* There should never be a "hole" extent in either extent list. */
-	if (irec->br_startblock == HOLESTARTBLOCK)
-		xchk_fblock_set_corrupt(info->sc, info->whichfork,
-				irec->br_startoff);
-
 	/* Make sure the extent points to a valid place. */
-	if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN)
-		xchk_fblock_set_corrupt(info->sc, info->whichfork,
-				irec->br_startoff);
 	if (info->is_rt &&
 	    !xfs_verify_rtext(mp, irec->br_startblock, irec->br_blockcount))
 		xchk_fblock_set_corrupt(info->sc, info->whichfork,
@@ -468,6 +479,12 @@ xchk_bmapbt_rec(
 		return 0;
 
 	xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
+	if (xfs_bmap_validate_extent(ip, info->whichfork, &irec) != NULL) {
+		xchk_fblock_set_corrupt(bs->sc, info->whichfork,
+				irec.br_startoff);
+		return 0;
+	}
+
 	if (!xfs_iext_lookup_extent(ip, ifp, irec.br_startoff, &icur,
 				&iext_irec) ||
 	    irec.br_startoff != iext_irec.br_startoff ||
@@ -618,45 +635,57 @@ xchk_bmap_check_ag_rmaps(
 	return error;
 }
 
-/* Make sure each rmap has a corresponding bmbt entry. */
-STATIC int
-xchk_bmap_check_rmaps(
-	struct xfs_scrub	*sc,
-	int			whichfork)
+/*
+ * Decide if we want to walk every rmap btree in the fs to make sure that each
+ * rmap for this file fork has corresponding bmbt entries.
+ */
+static bool
+xchk_bmap_want_check_rmaps(
+	struct xchk_bmap_info	*info)
 {
-	struct xfs_ifork	*ifp = xfs_ifork_ptr(sc->ip, whichfork);
-	struct xfs_perag	*pag;
-	xfs_agnumber_t		agno;
-	bool			zero_size;
-	int			error;
+	struct xfs_scrub	*sc = info->sc;
+	struct xfs_ifork	*ifp;
 
-	if (!xfs_has_rmapbt(sc->mp) ||
-	    whichfork == XFS_COW_FORK ||
-	    (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
-		return 0;
+	if (!xfs_has_rmapbt(sc->mp))
+		return false;
+	if (info->whichfork == XFS_COW_FORK)
+		return false;
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return false;
 
 	/* Don't support realtime rmap checks yet. */
-	if (XFS_IS_REALTIME_INODE(sc->ip) && whichfork == XFS_DATA_FORK)
-		return 0;
-
-	ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL);
+	if (info->is_rt)
+		return false;
 
 	/*
-	 * Only do this for complex maps that are in btree format, or for
-	 * situations where we would seem to have a size but zero extents.
-	 * The inode repair code can zap broken iforks, which means we have
-	 * to flag this bmap as corrupt if there are rmaps that need to be
-	 * reattached.
+	 * The inode repair code zaps broken inode forks by resetting them back
+	 * to EXTENTS format and zero extent records.  If we encounter a fork
+	 * in this state along with evidence that the fork isn't supposed to be
+	 * empty, we need to scan the reverse mappings to decide if we're going
+	 * to rebuild the fork.  Data forks with nonzero file size are scanned.
+	 * xattr forks are never empty of content, so they are always scanned.
 	 */
+	ifp = xfs_ifork_ptr(sc->ip, info->whichfork);
+	if (ifp->if_format == XFS_DINODE_FMT_EXTENTS && ifp->if_nextents == 0) {
+		if (info->whichfork == XFS_DATA_FORK &&
+		    i_size_read(VFS_I(sc->ip)) == 0)
+			return false;
 
-	if (whichfork == XFS_DATA_FORK)
-		zero_size = i_size_read(VFS_I(sc->ip)) == 0;
-	else
-		zero_size = false;
+		return true;
+	}
 
-	if (ifp->if_format != XFS_DINODE_FMT_BTREE &&
-	    (zero_size || ifp->if_nextents > 0))
-		return 0;
+	return false;
+}
+
+/* Make sure each rmap has a corresponding bmbt entry. */
+STATIC int
+xchk_bmap_check_rmaps(
+	struct xfs_scrub	*sc,
+	int			whichfork)
+{
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
+	int			error;
 
 	for_each_perag(sc->mp, agno, pag) {
 		error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag);
@@ -683,7 +712,8 @@ xchk_bmap_iextent_delalloc(
 	 * Check for out-of-order extents.  This record could have come
 	 * from the incore list, for which there is no ordering check.
 	 */
-	if (irec->br_startoff < info->lastoff)
+	if (irec->br_startoff < info->prev_rec.br_startoff +
+				info->prev_rec.br_blockcount)
 		xchk_fblock_set_corrupt(info->sc, info->whichfork,
 				irec->br_startoff);
 
@@ -697,6 +727,101 @@ xchk_bmap_iextent_delalloc(
 				irec->br_startoff);
 }
 
+/* Decide if this individual fork mapping is ok. */
+static bool
+xchk_bmap_iext_mapping(
+	struct xchk_bmap_info		*info,
+	const struct xfs_bmbt_irec	*irec)
+{
+	/* There should never be a "hole" extent in either extent list. */
+	if (irec->br_startblock == HOLESTARTBLOCK)
+		return false;
+	if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN)
+		return false;
+	return true;
+}
+
+/* Are these two mappings contiguous with each other? */
+static inline bool
+xchk_are_bmaps_contiguous(
+	const struct xfs_bmbt_irec	*b1,
+	const struct xfs_bmbt_irec	*b2)
+{
+	/* Don't try to combine unallocated mappings. */
+	if (!xfs_bmap_is_real_extent(b1))
+		return false;
+	if (!xfs_bmap_is_real_extent(b2))
+		return false;
+
+	/* Does b2 come right after b1 in the logical and physical range? */
+	if (b1->br_startoff + b1->br_blockcount != b2->br_startoff)
+		return false;
+	if (b1->br_startblock + b1->br_blockcount != b2->br_startblock)
+		return false;
+	if (b1->br_state != b2->br_state)
+		return false;
+	return true;
+}
+
+/*
+ * Walk the incore extent records, accumulating consecutive contiguous records
+ * into a single incore mapping.  Returns true if @irec has been set to a
+ * mapping or false if there are no more mappings.  Caller must ensure that
+ * @info.icur is zeroed before the first call.
+ */
+static int
+xchk_bmap_iext_iter(
+	struct xchk_bmap_info	*info,
+	struct xfs_bmbt_irec	*irec)
+{
+	struct xfs_bmbt_irec	got;
+	struct xfs_ifork	*ifp;
+	xfs_filblks_t		prev_len;
+
+	ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork);
+
+	/* Advance to the next iextent record and check the mapping. */
+	xfs_iext_next(ifp, &info->icur);
+	if (!xfs_iext_get_extent(ifp, &info->icur, irec))
+		return false;
+
+	if (!xchk_bmap_iext_mapping(info, irec)) {
+		xchk_fblock_set_corrupt(info->sc, info->whichfork,
+				irec->br_startoff);
+		return false;
+	}
+
+	/*
+	 * Iterate subsequent iextent records and merge them with the one
+	 * that we just read, if possible.
+	 */
+	prev_len = irec->br_blockcount;
+	while (xfs_iext_peek_next_extent(ifp, &info->icur, &got)) {
+		if (!xchk_are_bmaps_contiguous(irec, &got))
+			break;
+
+		if (!xchk_bmap_iext_mapping(info, &got)) {
+			xchk_fblock_set_corrupt(info->sc, info->whichfork,
+					got.br_startoff);
+			return false;
+		}
+
+		/*
+		 * Notify the user of mergeable records in the data or attr
+		 * forks.  CoW forks only exist in memory so we ignore them.
+		 */
+		if (info->whichfork != XFS_COW_FORK &&
+		    prev_len + got.br_blockcount > BMBT_BLOCKCOUNT_MASK)
+			xchk_ino_set_preen(info->sc, info->sc->ip->i_ino);
+
+		irec->br_blockcount += got.br_blockcount;
+		prev_len = got.br_blockcount;
+		xfs_iext_next(ifp, &info->icur);
+	}
+
+	return true;
+}
+
 /*
  * Scrub an inode fork's block mappings.
  *
@@ -776,10 +901,15 @@ xchk_bmap(
 	if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
 		goto out;
 
-	/* Scrub extent records. */
-	info.lastoff = 0;
-	ifp = xfs_ifork_ptr(ip, whichfork);
-	for_each_xfs_iext(ifp, &info.icur, &irec) {
+	/*
+	 * Scrub extent records.  We use a special iterator function here that
+	 * combines adjacent mappings if they are logically and physically
+	 * contiguous.   For large allocations that require multiple bmbt
+	 * records, this reduces the number of cross-referencing calls, which
+	 * reduces runtime.  Cross referencing with the rmap is simpler because
+	 * the rmap must match the combined mapping exactly.
+	 */
+	while (xchk_bmap_iext_iter(&info, &irec)) {
 		if (xchk_should_terminate(sc, &error) ||
 		    (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
 			goto out;
@@ -794,12 +924,14 @@ xchk_bmap(
 			xchk_bmap_iextent_delalloc(ip, &info, &irec);
 		else
 			xchk_bmap_iextent(ip, &info, &irec);
-		info.lastoff = irec.br_startoff + irec.br_blockcount;
+		memcpy(&info.prev_rec, &irec, sizeof(struct xfs_bmbt_irec));
 	}
 
-	error = xchk_bmap_check_rmaps(sc, whichfork);
-	if (!xchk_fblock_xref_process_error(sc, whichfork, 0, &error))
-		goto out;
+	if (xchk_bmap_want_check_rmaps(&info)) {
+		error = xchk_bmap_check_rmaps(sc, whichfork);
+		if (!xchk_fblock_xref_process_error(sc, whichfork, 0, &error))
+			goto out;
+	}
 out:
 	return error;
 }
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index 0fd36d5b4646..1935b9ce1885 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2017 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_fs.h"
@@ -36,6 +36,7 @@ __xchk_btree_process_error(
 
 	switch (*error) {
 	case -EDEADLOCK:
+	case -ECHRNG:
 		/* Used to restart an op with deadlock avoidance. */
 		trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
 		break;
@@ -118,6 +119,16 @@ xchk_btree_xref_set_corrupt(
 			__return_address);
 }
 
+void
+xchk_btree_set_preen(
+	struct xfs_scrub	*sc,
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	__xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_PREEN,
+			__return_address);
+}
+
 /*
  * Make sure this record is in order and doesn't stray outside of the parent
  * keys.
@@ -140,29 +151,30 @@ xchk_btree_rec(
 
 	trace_xchk_btree_rec(bs->sc, cur, 0);
 
-	/* If this isn't the first record, are they in order? */
-	if (cur->bc_levels[0].ptr > 1 &&
+	/* Are all records across all record blocks in order? */
+	if (bs->lastrec_valid &&
 	    !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec))
 		xchk_btree_set_corrupt(bs->sc, cur, 0);
 	memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len);
+	bs->lastrec_valid = true;
 
 	if (cur->bc_nlevels == 1)
 		return;
 
-	/* Is this at least as large as the parent low key? */
+	/* Is low_key(rec) at least as large as the parent low key? */
 	cur->bc_ops->init_key_from_rec(&key, rec);
 	keyblock = xfs_btree_get_block(cur, 1, &bp);
 	keyp = xfs_btree_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
-	if (cur->bc_ops->diff_two_keys(cur, &key, keyp) < 0)
+	if (xfs_btree_keycmp_lt(cur, &key, keyp))
 		xchk_btree_set_corrupt(bs->sc, cur, 1);
 
 	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
 		return;
 
-	/* Is this no larger than the parent high key? */
+	/* Is high_key(rec) no larger than the parent high key? */
 	cur->bc_ops->init_high_key_from_rec(&hkey, rec);
 	keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
-	if (cur->bc_ops->diff_two_keys(cur, keyp, &hkey) < 0)
+	if (xfs_btree_keycmp_lt(cur, keyp, &hkey))
 		xchk_btree_set_corrupt(bs->sc, cur, 1);
 }
 
@@ -187,29 +199,30 @@ xchk_btree_key(
 
 	trace_xchk_btree_key(bs->sc, cur, level);
 
-	/* If this isn't the first key, are they in order? */
-	if (cur->bc_levels[level].ptr > 1 &&
-	    !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1], key))
+	/* Are all low keys across all node blocks in order? */
+	if (bs->lastkey[level - 1].valid &&
+	    !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1].key, key))
 		xchk_btree_set_corrupt(bs->sc, cur, level);
-	memcpy(&bs->lastkey[level - 1], key, cur->bc_ops->key_len);
+	memcpy(&bs->lastkey[level - 1].key, key, cur->bc_ops->key_len);
+	bs->lastkey[level - 1].valid = true;
 
 	if (level + 1 >= cur->bc_nlevels)
 		return;
 
-	/* Is this at least as large as the parent low key? */
+	/* Is this block's low key at least as large as the parent low key? */
 	keyblock = xfs_btree_get_block(cur, level + 1, &bp);
 	keyp = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, keyblock);
-	if (cur->bc_ops->diff_two_keys(cur, key, keyp) < 0)
+	if (xfs_btree_keycmp_lt(cur, key, keyp))
 		xchk_btree_set_corrupt(bs->sc, cur, level);
 
 	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
 		return;
 
-	/* Is this no larger than the parent high key? */
+	/* Is this block's high key no larger than the parent high key? */
 	key = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, block);
 	keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
 			keyblock);
-	if (cur->bc_ops->diff_two_keys(cur, keyp, key) < 0)
+	if (xfs_btree_keycmp_lt(cur, keyp, key))
 		xchk_btree_set_corrupt(bs->sc, cur, level);
 }
 
@@ -389,7 +402,7 @@ xchk_btree_check_block_owner(
 	if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO)
 		bs->cur = NULL;
 
-	xchk_xref_is_owned_by(bs->sc, agbno, 1, bs->oinfo);
+	xchk_xref_is_only_owned_by(bs->sc, agbno, 1, bs->oinfo);
 	if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP)
 		bs->cur = NULL;
 
@@ -519,6 +532,48 @@ xchk_btree_check_minrecs(
 }
 
 /*
+ * If this btree block has a parent, make sure that the parent's keys capture
+ * the keyspace contained in this block.
+ */
+STATIC void
+xchk_btree_block_check_keys(
+	struct xchk_btree	*bs,
+	int			level,
+	struct xfs_btree_block	*block)
+{
+	union xfs_btree_key	block_key;
+	union xfs_btree_key	*block_high_key;
+	union xfs_btree_key	*parent_low_key, *parent_high_key;
+	struct xfs_btree_cur	*cur = bs->cur;
+	struct xfs_btree_block	*parent_block;
+	struct xfs_buf		*bp;
+
+	if (level == cur->bc_nlevels - 1)
+		return;
+
+	xfs_btree_get_keys(cur, block, &block_key);
+
+	/* Make sure the low key of this block matches the parent. */
+	parent_block = xfs_btree_get_block(cur, level + 1, &bp);
+	parent_low_key = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr,
+			parent_block);
+	if (xfs_btree_keycmp_ne(cur, &block_key, parent_low_key)) {
+		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
+		return;
+	}
+
+	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+		return;
+
+	/* Make sure the high key of this block matches the parent. */
+	parent_high_key = xfs_btree_high_key_addr(cur,
+			cur->bc_levels[level + 1].ptr, parent_block);
+	block_high_key = xfs_btree_high_key_from_key(cur, &block_key);
+	if (xfs_btree_keycmp_ne(cur, block_high_key, parent_high_key))
+		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
+}
+
+/*
  * Grab and scrub a btree block given a btree pointer.  Returns block
  * and buffer pointers (if applicable) if they're ok to use.
  */
@@ -569,7 +624,12 @@ xchk_btree_get_block(
 	 * Check the block's siblings; this function absorbs error codes
 	 * for us.
 	 */
-	return xchk_btree_block_check_siblings(bs, *pblock);
+	error = xchk_btree_block_check_siblings(bs, *pblock);
+	if (error)
+		return error;
+
+	xchk_btree_block_check_keys(bs, level, *pblock);
+	return 0;
 }
 
 /*
@@ -601,7 +661,7 @@ xchk_btree_block_keys(
 	parent_keys = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr,
 			parent_block);
 
-	if (cur->bc_ops->diff_two_keys(cur, &block_keys, parent_keys) != 0)
+	if (xfs_btree_keycmp_ne(cur, &block_keys, parent_keys))
 		xchk_btree_set_corrupt(bs->sc, cur, 1);
 
 	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
@@ -612,7 +672,7 @@ xchk_btree_block_keys(
 	high_pk = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
 			parent_block);
 
-	if (cur->bc_ops->diff_two_keys(cur, high_bk, high_pk) != 0)
+	if (xfs_btree_keycmp_ne(cur, high_bk, high_pk))
 		xchk_btree_set_corrupt(bs->sc, cur, 1);
 }
 
diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h
index da61a53a0b61..9d7b9ee8bef4 100644
--- a/fs/xfs/scrub/btree.h
+++ b/fs/xfs/scrub/btree.h
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2017 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #ifndef __XFS_SCRUB_BTREE_H__
 #define __XFS_SCRUB_BTREE_H__
@@ -19,6 +19,8 @@ bool xchk_btree_xref_process_error(struct xfs_scrub *sc,
 /* Check for btree corruption. */
 void xchk_btree_set_corrupt(struct xfs_scrub *sc,
 		struct xfs_btree_cur *cur, int level);
+void xchk_btree_set_preen(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
+		int level);
 
 /* Check for btree xref discrepancies. */
 void xchk_btree_xref_set_corrupt(struct xfs_scrub *sc,
@@ -29,6 +31,11 @@ typedef int (*xchk_btree_rec_fn)(
 	struct xchk_btree		*bs,
 	const union xfs_btree_rec	*rec);
 
+struct xchk_btree_key {
+	union xfs_btree_key		key;
+	bool				valid;
+};
+
 struct xchk_btree {
 	/* caller-provided scrub state */
 	struct xfs_scrub		*sc;
@@ -38,11 +45,12 @@ struct xchk_btree {
 	void				*private;
 
 	/* internal scrub state */
+	bool				lastrec_valid;
 	union xfs_btree_rec		lastrec;
 	struct list_head		to_check;
 
 	/* this element must come last! */
-	union xfs_btree_key		lastkey[];
+	struct xchk_btree_key		lastkey[];
 };
 
 /*
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 848a8e32e56f..9aa79665c608 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2017 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_fs.h"
@@ -75,6 +75,7 @@ __xchk_process_error(
 	case 0:
 		return true;
 	case -EDEADLOCK:
+	case -ECHRNG:
 		/* Used to restart an op with deadlock avoidance. */
 		trace_xchk_deadlock_retry(
 				sc->ip ? sc->ip : XFS_I(file_inode(sc->file)),
@@ -130,6 +131,7 @@ __xchk_fblock_process_error(
 	case 0:
 		return true;
 	case -EDEADLOCK:
+	case -ECHRNG:
 		/* Used to restart an op with deadlock avoidance. */
 		trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
 		break;
@@ -396,26 +398,19 @@ want_ag_read_header_failure(
 }
 
 /*
- * Grab the perag structure and all the headers for an AG.
+ * Grab the AG header buffers for the attached perag structure.
  *
  * The headers should be released by xchk_ag_free, but as a fail safe we attach
  * all the buffers we grab to the scrub transaction so they'll all be freed
- * when we cancel it.  Returns ENOENT if we can't grab the perag structure.
+ * when we cancel it.
  */
-int
-xchk_ag_read_headers(
+static inline int
+xchk_perag_read_headers(
 	struct xfs_scrub	*sc,
-	xfs_agnumber_t		agno,
 	struct xchk_ag		*sa)
 {
-	struct xfs_mount	*mp = sc->mp;
 	int			error;
 
-	ASSERT(!sa->pag);
-	sa->pag = xfs_perag_get(mp, agno);
-	if (!sa->pag)
-		return -ENOENT;
-
 	error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp);
 	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
 		return error;
@@ -427,6 +422,104 @@ xchk_ag_read_headers(
 	return 0;
 }
 
+/*
+ * Grab the AG headers for the attached perag structure and wait for pending
+ * intents to drain.
+ */
+static int
+xchk_perag_drain_and_lock(
+	struct xfs_scrub	*sc)
+{
+	struct xchk_ag		*sa = &sc->sa;
+	int			error = 0;
+
+	ASSERT(sa->pag != NULL);
+	ASSERT(sa->agi_bp == NULL);
+	ASSERT(sa->agf_bp == NULL);
+
+	do {
+		if (xchk_should_terminate(sc, &error))
+			return error;
+
+		error = xchk_perag_read_headers(sc, sa);
+		if (error)
+			return error;
+
+		/*
+		 * If we've grabbed an inode for scrubbing then we assume that
+		 * holding its ILOCK will suffice to coordinate with any intent
+		 * chains involving this inode.
+		 */
+		if (sc->ip)
+			return 0;
+
+		/*
+		 * Decide if this AG is quiet enough for all metadata to be
+		 * consistent with each other.  XFS allows the AG header buffer
+		 * locks to cycle across transaction rolls while processing
+		 * chains of deferred ops, which means that there could be
+		 * other threads in the middle of processing a chain of
+		 * deferred ops.  For regular operations we are careful about
+		 * ordering operations to prevent collisions between threads
+		 * (which is why we don't need a per-AG lock), but scrub and
+		 * repair have to serialize against chained operations.
+		 *
+		 * We just locked all the AG headers buffers; now take a look
+		 * to see if there are any intents in progress.  If there are,
+		 * drop the AG headers and wait for the intents to drain.
+		 * Since we hold all the AG header locks for the duration of
+		 * the scrub, this is the only time we have to sample the
+		 * intents counter; any threads increasing it after this point
+		 * can't possibly be in the middle of a chain of AG metadata
+		 * updates.
+		 *
+		 * Obviously, this should be slanted against scrub and in favor
+		 * of runtime threads.
+		 */
+		if (!xfs_perag_intent_busy(sa->pag))
+			return 0;
+
+		if (sa->agf_bp) {
+			xfs_trans_brelse(sc->tp, sa->agf_bp);
+			sa->agf_bp = NULL;
+		}
+
+		if (sa->agi_bp) {
+			xfs_trans_brelse(sc->tp, sa->agi_bp);
+			sa->agi_bp = NULL;
+		}
+
+		if (!(sc->flags & XCHK_FSGATES_DRAIN))
+			return -ECHRNG;
+		error = xfs_perag_intent_drain(sa->pag);
+		if (error == -ERESTARTSYS)
+			error = -EINTR;
+	} while (!error);
+
+	return error;
+}
+
+/*
+ * Grab the per-AG structure, grab all AG header buffers, and wait until there
+ * aren't any pending intents.  Returns -ENOENT if we can't grab the perag
+ * structure.
+ */
+int
+xchk_ag_read_headers(
+	struct xfs_scrub	*sc,
+	xfs_agnumber_t		agno,
+	struct xchk_ag		*sa)
+{
+	struct xfs_mount	*mp = sc->mp;
+
+	ASSERT(!sa->pag);
+	sa->pag = xfs_perag_get(mp, agno);
+	if (!sa->pag)
+		return -ENOENT;
+
+	return xchk_perag_drain_and_lock(sc);
+}
+
 /* Release all the AG btree cursors. */
 void
 xchk_ag_btcur_free(
@@ -550,6 +643,14 @@ xchk_ag_init(
 
 /* Per-scrubber setup functions */
 
+void
+xchk_trans_cancel(
+	struct xfs_scrub	*sc)
+{
+	xfs_trans_cancel(sc->tp);
+	sc->tp = NULL;
+}
+
 /*
  * Grab an empty transaction so that we can re-grab locked buffers if
  * one of our btrees turns out to be cyclic.
@@ -625,80 +726,273 @@ xchk_checkpoint_log(
 	return 0;
 }
 
+/* Verify that an inode is allocated ondisk, then return its cached inode. */
+int
+xchk_iget(
+	struct xfs_scrub	*sc,
+	xfs_ino_t		inum,
+	struct xfs_inode	**ipp)
+{
+	return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp);
+}
+
 /*
- * Given an inode and the scrub control structure, grab either the
- * inode referenced in the control structure or the inode passed in.
- * The inode is not locked.
+ * Try to grab an inode in a manner that avoids races with physical inode
+ * allocation.  If we can't, return the locked AGI buffer so that the caller
+ * can single-step the loading process to see where things went wrong.
+ * Callers must have a valid scrub transaction.
+ *
+ * If the iget succeeds, return 0, a NULL AGI, and the inode.
+ *
+ * If the iget fails, return the error, the locked AGI, and a NULL inode.  This
+ * can include -EINVAL and -ENOENT for invalid inode numbers or inodes that are
+ * no longer allocated; or any other corruption or runtime error.
+ *
+ * If the AGI read fails, return the error, a NULL AGI, and NULL inode.
+ *
+ * If a fatal signal is pending, return -EINTR, a NULL AGI, and a NULL inode.
  */
 int
-xchk_get_inode(
+xchk_iget_agi(
+	struct xfs_scrub	*sc,
+	xfs_ino_t		inum,
+	struct xfs_buf		**agi_bpp,
+	struct xfs_inode	**ipp)
+{
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_trans	*tp = sc->tp;
+	struct xfs_perag	*pag;
+	int			error;
+
+	ASSERT(sc->tp != NULL);
+
+again:
+	*agi_bpp = NULL;
+	*ipp = NULL;
+	error = 0;
+
+	if (xchk_should_terminate(sc, &error))
+		return error;
+
+	/*
+	 * Attach the AGI buffer to the scrub transaction to avoid deadlocks
+	 * in the iget cache miss path.
+	 */
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
+	error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
+	xfs_perag_put(pag);
+	if (error)
+		return error;
+
+	error = xfs_iget(mp, tp, inum,
+			XFS_IGET_NORETRY | XFS_IGET_UNTRUSTED, 0, ipp);
+	if (error == -EAGAIN) {
+		/*
+		 * The inode may be in core but temporarily unavailable and may
+		 * require the AGI buffer before it can be returned.  Drop the
+		 * AGI buffer and retry the lookup.
+		 *
+		 * Incore lookup will fail with EAGAIN on a cache hit if the
+		 * inode is queued to the inactivation list.  The inactivation
+		 * worker may remove the inode from the unlinked list and hence
+		 * needs the AGI.
+		 *
+		 * Hence xchk_iget_agi() needs to drop the AGI lock on EAGAIN
+		 * to allow inodegc to make progress and move the inode to
+		 * IRECLAIMABLE state where xfs_iget will be able to return it
+		 * again if it can lock the inode.
+		 */
+		xfs_trans_brelse(tp, *agi_bpp);
+		delay(1);
+		goto again;
+	}
+	if (error)
+		return error;
+
+	/* We got the inode, so we can release the AGI. */
+	ASSERT(*ipp != NULL);
+	xfs_trans_brelse(tp, *agi_bpp);
+	*agi_bpp = NULL;
+	return 0;
+}
+
+/* Install an inode that we opened by handle for scrubbing. */
+int
+xchk_install_handle_inode(
+	struct xfs_scrub	*sc,
+	struct xfs_inode	*ip)
+{
+	if (VFS_I(ip)->i_generation != sc->sm->sm_gen) {
+		xchk_irele(sc, ip);
+		return -ENOENT;
+	}
+
+	sc->ip = ip;
+	return 0;
+}
+
+/*
+ * In preparation to scrub metadata structures that hang off of an inode,
+ * grab either the inode referenced in the scrub control structure or the
+ * inode passed in.  If the inumber does not reference an allocated inode
+ * record, the function returns ENOENT to end the scrub early.  The inode
+ * is not locked.
+ */
+int
+xchk_iget_for_scrubbing(
 	struct xfs_scrub	*sc)
 {
 	struct xfs_imap		imap;
 	struct xfs_mount	*mp = sc->mp;
 	struct xfs_perag	*pag;
+	struct xfs_buf		*agi_bp;
 	struct xfs_inode	*ip_in = XFS_I(file_inode(sc->file));
 	struct xfs_inode	*ip = NULL;
+	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino);
 	int			error;
 
+	ASSERT(sc->tp == NULL);
+
 	/* We want to scan the inode we already had opened. */
 	if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) {
 		sc->ip = ip_in;
 		return 0;
 	}
 
-	/* Look up the inode, see if the generation number matches. */
+	/* Reject internal metadata files and obviously bad inode numbers. */
 	if (xfs_internal_inum(mp, sc->sm->sm_ino))
 		return -ENOENT;
-	error = xfs_iget(mp, NULL, sc->sm->sm_ino,
-			XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, &ip);
-	switch (error) {
-	case -ENOENT:
-		/* Inode doesn't exist, just bail out. */
+	if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
+		return -ENOENT;
+
+	/* Try a regular untrusted iget. */
+	error = xchk_iget(sc, sc->sm->sm_ino, &ip);
+	if (!error)
+		return xchk_install_handle_inode(sc, ip);
+	if (error == -ENOENT)
 		return error;
-	case 0:
-		/* Got an inode, continue. */
-		break;
-	case -EINVAL:
+	if (error != -EINVAL)
+		goto out_error;
+
+	/*
+	 * EINVAL with IGET_UNTRUSTED probably means one of several things:
+	 * userspace gave us an inode number that doesn't correspond to fs
+	 * space; the inode btree lacks a record for this inode; or there is a
+	 * record, and it says this inode is free.
+	 *
+	 * We want to look up this inode in the inobt to distinguish two
+	 * scenarios: (1) the inobt says the inode is free, in which case
+	 * there's nothing to do; and (2) the inobt says the inode is
+	 * allocated, but loading it failed due to corruption.
+	 *
+	 * Allocate a transaction and grab the AGI to prevent inobt activity
+	 * in this AG.  Retry the iget in case someone allocated a new inode
+	 * after the first iget failed.
+	 */
+	error = xchk_trans_alloc(sc, 0);
+	if (error)
+		goto out_error;
+
+	error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip);
+	if (error == 0) {
+		/* Actually got the inode, so install it. */
+		xchk_trans_cancel(sc);
+		return xchk_install_handle_inode(sc, ip);
+	}
+	if (error == -ENOENT)
+		goto out_gone;
+	if (error != -EINVAL)
+		goto out_cancel;
+
+	/* Ensure that we have protected against inode allocation/freeing. */
+	if (agi_bp == NULL) {
+		ASSERT(agi_bp != NULL);
+		error = -ECANCELED;
+		goto out_cancel;
+	}
+
+	/*
+	 * Untrusted iget failed a second time.  Let's try an inobt lookup.
+	 * If the inobt thinks this the inode neither can exist inside the
+	 * filesystem nor is allocated, return ENOENT to signal that the check
+	 * can be skipped.
+	 *
+	 * If the lookup returns corruption, we'll mark this inode corrupt and
+	 * exit to userspace.  There's little chance of fixing anything until
+	 * the inobt is straightened out, but there's nothing we can do here.
+	 *
+	 * If the lookup encounters any other error, exit to userspace.
+	 *
+	 * If the lookup succeeds, something else must be very wrong in the fs
+	 * such that setting up the incore inode failed in some strange way.
+	 * Treat those as corruptions.
+	 */
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino));
+	if (!pag) {
+		error = -EFSCORRUPTED;
+		goto out_cancel;
+	}
+
+	error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap,
+			XFS_IGET_UNTRUSTED);
+	xfs_perag_put(pag);
+	if (error == -EINVAL || error == -ENOENT)
+		goto out_gone;
+	if (!error)
+		error = -EFSCORRUPTED;
+
+out_cancel:
+	xchk_trans_cancel(sc);
+out_error:
+	trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
+			error, __return_address);
+	return error;
+out_gone:
+	/* The file is gone, so there's nothing to check. */
+	xchk_trans_cancel(sc);
+	return -ENOENT;
+}
+
+/* Release an inode, possibly dropping it in the process. */
+void
+xchk_irele(
+	struct xfs_scrub	*sc,
+	struct xfs_inode	*ip)
+{
+	if (current->journal_info != NULL) {
+		ASSERT(current->journal_info == sc->tp);
+
 		/*
-		 * -EINVAL with IGET_UNTRUSTED could mean one of several
-		 * things: userspace gave us an inode number that doesn't
-		 * correspond to fs space, or doesn't have an inobt entry;
-		 * or it could simply mean that the inode buffer failed the
-		 * read verifiers.
+		 * If we are in a transaction, we /cannot/ drop the inode
+		 * ourselves, because the VFS will trigger writeback, which
+		 * can require a transaction.  Clear DONTCACHE to force the
+		 * inode to the LRU, where someone else can take care of
+		 * dropping it.
 		 *
-		 * Try just the inode mapping lookup -- if it succeeds, then
-		 * the inode buffer verifier failed and something needs fixing.
-		 * Otherwise, we really couldn't find it so tell userspace
-		 * that it no longer exists.
+		 * Note that when we grabbed our reference to the inode, it
+		 * could have had an active ref and DONTCACHE set if a sysadmin
+		 * is trying to coerce a change in file access mode.  icache
+		 * hits do not clear DONTCACHE, so we must do it here.
 		 */
-		pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino));
-		if (pag) {
-			error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap,
-					XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE);
-			xfs_perag_put(pag);
-			if (error)
-				return -ENOENT;
-		}
-		error = -EFSCORRUPTED;
-		fallthrough;
-	default:
-		trace_xchk_op_error(sc,
-				XFS_INO_TO_AGNO(mp, sc->sm->sm_ino),
-				XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
-				error, __return_address);
-		return error;
-	}
-	if (VFS_I(ip)->i_generation != sc->sm->sm_gen) {
-		xfs_irele(ip);
-		return -ENOENT;
+		spin_lock(&VFS_I(ip)->i_lock);
+		VFS_I(ip)->i_state &= ~I_DONTCACHE;
+		spin_unlock(&VFS_I(ip)->i_lock);
+	} else if (atomic_read(&VFS_I(ip)->i_count) == 1) {
+		/*
+		 * If this is the last reference to the inode and the caller
+		 * permits it, set DONTCACHE to avoid thrashing.
+		 */
+		d_mark_dontcache(VFS_I(ip));
 	}
 
-	sc->ip = ip;
-	return 0;
+	xfs_irele(ip);
 }
 
-/* Set us up to scrub a file's contents. */
+/*
+ * Set us up to scrub metadata mapped by a file's fork.  Callers must not use
+ * this to operate on user-accessible regular file data because the MMAPLOCK is
+ * not taken.
+ */
 int
 xchk_setup_inode_contents(
 	struct xfs_scrub	*sc,
@@ -706,13 +1000,14 @@ xchk_setup_inode_contents(
 {
 	int			error;
 
-	error = xchk_get_inode(sc);
+	error = xchk_iget_for_scrubbing(sc);
 	if (error)
 		return error;
 
-	/* Got the inode, lock it and we're ready to go. */
-	sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+	/* Lock the inode so the VFS cannot touch this file. */
+	sc->ilock_flags = XFS_IOLOCK_EXCL;
 	xfs_ilock(sc->ip, sc->ilock_flags);
+
 	error = xchk_trans_alloc(sc, resblks);
 	if (error)
 		goto out;
@@ -869,28 +1164,6 @@ xchk_metadata_inode_forks(
 	return 0;
 }
 
-/*
- * Try to lock an inode in violation of the usual locking order rules.  For
- * example, trying to get the IOLOCK while in transaction context, or just
- * plain breaking AG-order or inode-order inode locking rules.  Either way,
- * the only way to avoid an ABBA deadlock is to use trylock and back off if
- * we can't.
- */
-int
-xchk_ilock_inverted(
-	struct xfs_inode	*ip,
-	uint			lock_mode)
-{
-	int			i;
-
-	for (i = 0; i < 20; i++) {
-		if (xfs_ilock_nowait(ip, lock_mode))
-			return 0;
-		delay(1);
-	}
-	return -EDEADLOCK;
-}
-
 /* Pause background reaping of resources. */
 void
 xchk_stop_reaping(
@@ -916,3 +1189,25 @@ xchk_start_reaping(
 	}
 	sc->flags &= ~XCHK_REAPING_DISABLED;
 }
+
+/*
+ * Enable filesystem hooks (i.e. runtime code patching) before starting a scrub
+ * operation.  Callers must not hold any locks that intersect with the CPU
+ * hotplug lock (e.g. writeback locks) because code patching must halt the CPUs
+ * to change kernel code.
+ */
+void
+xchk_fsgates_enable(
+	struct xfs_scrub	*sc,
+	unsigned int		scrub_fsgates)
+{
+	ASSERT(!(scrub_fsgates & ~XCHK_FSGATES_ALL));
+	ASSERT(!(sc->flags & scrub_fsgates));
+
+	trace_xchk_fsgates_enable(sc, scrub_fsgates);
+
+	if (scrub_fsgates & XCHK_FSGATES_DRAIN)
+		xfs_drain_wait_enable();
+
+	sc->flags |= scrub_fsgates;
+}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index b73648d81d23..18b5f2b62f13 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2017 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #ifndef __XFS_SCRUB_COMMON_H__
 #define __XFS_SCRUB_COMMON_H__
@@ -32,6 +32,8 @@ xchk_should_terminate(
 }
 
 int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks);
+void xchk_trans_cancel(struct xfs_scrub *sc);
+
 bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno,
 		xfs_agblock_t bno, int *error);
 bool xchk_fblock_process_error(struct xfs_scrub *sc, int whichfork,
@@ -72,6 +74,7 @@ bool xchk_should_check_xref(struct xfs_scrub *sc, int *error,
 			   struct xfs_btree_cur **curpp);
 
 /* Setup functions */
+int xchk_setup_agheader(struct xfs_scrub *sc);
 int xchk_setup_fs(struct xfs_scrub *sc);
 int xchk_setup_ag_allocbt(struct xfs_scrub *sc);
 int xchk_setup_ag_iallocbt(struct xfs_scrub *sc);
@@ -132,10 +135,16 @@ int xchk_count_rmap_ownedby_ag(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
 		const struct xfs_owner_info *oinfo, xfs_filblks_t *blocks);
 
 int xchk_setup_ag_btree(struct xfs_scrub *sc, bool force_log);
-int xchk_get_inode(struct xfs_scrub *sc);
+int xchk_iget_for_scrubbing(struct xfs_scrub *sc);
 int xchk_setup_inode_contents(struct xfs_scrub *sc, unsigned int resblks);
 void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp);
 
+int xchk_iget(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp);
+int xchk_iget_agi(struct xfs_scrub *sc, xfs_ino_t inum,
+		struct xfs_buf **agi_bpp, struct xfs_inode **ipp);
+void xchk_irele(struct xfs_scrub *sc, struct xfs_inode *ip);
+int xchk_install_handle_inode(struct xfs_scrub *sc, struct xfs_inode *ip);
+
 /*
  * Don't bother cross-referencing if we already found corruption or cross
  * referencing discrepancies.
@@ -147,8 +156,21 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm)
 }
 
 int xchk_metadata_inode_forks(struct xfs_scrub *sc);
-int xchk_ilock_inverted(struct xfs_inode *ip, uint lock_mode);
 void xchk_stop_reaping(struct xfs_scrub *sc);
 void xchk_start_reaping(struct xfs_scrub *sc);
 
+/*
+ * Setting up a hook to wait for intents to drain is costly -- we have to take
+ * the CPU hotplug lock and force an i-cache flush on all CPUs once to set it
+ * up, and again to tear it down.  These costs add up quickly, so we only want
+ * to enable the drain waiter if the drain actually detected a conflict with
+ * running intent chains.
+ */
+static inline bool xchk_need_intent_drain(struct xfs_scrub *sc)
+{
+	return sc->flags & XCHK_NEED_DRAIN;
+}
+
+void xchk_fsgates_enable(struct xfs_scrub *sc, unsigned int scrub_fshooks);
+
 #endif	/* __XFS_SCRUB_COMMON_H__ */
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index d17cee177085..82b150d3b8b7 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2017 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_fs.h"
@@ -39,6 +39,7 @@ xchk_da_process_error(
 
 	switch (*error) {
 	case -EDEADLOCK:
+	case -ECHRNG:
 		/* Used to restart an op with deadlock avoidance. */
 		trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
 		break;
diff --git a/fs/xfs/scrub/dabtree.h b/fs/xfs/scrub/dabtree.h
index 1f3515c6d5a8..4f8c2138a1ec 100644
--- a/fs/xfs/scrub/dabtree.h
+++ b/fs/xfs/scrub/dabtree.h
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2017 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #ifndef __XFS_SCRUB_DABTREE_H__
 #define __XFS_SCRUB_DABTREE_H__
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index d1b0f23c2c59..0b491784b759 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2017 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_fs.h"
@@ -18,6 +18,7 @@
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/dabtree.h"
+#include "scrub/readdir.h"
 
 /* Set us up to scrub directories. */
 int
@@ -31,168 +32,114 @@ xchk_setup_directory(
 
 /* Scrub a directory entry. */
 
-struct xchk_dir_ctx {
-	/* VFS fill-directory iterator */
-	struct dir_context	dir_iter;
-
-	struct xfs_scrub	*sc;
-};
-
-/* Check that an inode's mode matches a given DT_ type. */
-STATIC int
+/* Check that an inode's mode matches a given XFS_DIR3_FT_* type. */
+STATIC void
 xchk_dir_check_ftype(
-	struct xchk_dir_ctx	*sdc,
+	struct xfs_scrub	*sc,
 	xfs_fileoff_t		offset,
-	xfs_ino_t		inum,
-	int			dtype)
+	struct xfs_inode	*ip,
+	int			ftype)
 {
-	struct xfs_mount	*mp = sdc->sc->mp;
-	struct xfs_inode	*ip;
-	int			ino_dtype;
-	int			error = 0;
+	struct xfs_mount	*mp = sc->mp;
 
 	if (!xfs_has_ftype(mp)) {
-		if (dtype != DT_UNKNOWN && dtype != DT_DIR)
-			xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
-					offset);
-		goto out;
-	}
-
-	/*
-	 * Grab the inode pointed to by the dirent.  We release the
-	 * inode before we cancel the scrub transaction.  Since we're
-	 * don't know a priori that releasing the inode won't trigger
-	 * eofblocks cleanup (which allocates what would be a nested
-	 * transaction), we can't use DONTCACHE here because DONTCACHE
-	 * inodes can trigger immediate inactive cleanup of the inode.
-	 *
-	 * If _iget returns -EINVAL or -ENOENT then the child inode number is
-	 * garbage and the directory is corrupt.  If the _iget returns
-	 * -EFSCORRUPTED or -EFSBADCRC then the child is corrupt which is a
-	 *  cross referencing error.  Any other error is an operational error.
-	 */
-	error = xfs_iget(mp, sdc->sc->tp, inum, 0, 0, &ip);
-	if (error == -EINVAL || error == -ENOENT) {
-		error = -EFSCORRUPTED;
-		xchk_fblock_process_error(sdc->sc, XFS_DATA_FORK, 0, &error);
-		goto out;
+		if (ftype != XFS_DIR3_FT_UNKNOWN && ftype != XFS_DIR3_FT_DIR)
+			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+		return;
 	}
-	if (!xchk_fblock_xref_process_error(sdc->sc, XFS_DATA_FORK, offset,
-			&error))
-		goto out;
 
-	/* Convert mode to the DT_* values that dir_emit uses. */
-	ino_dtype = xfs_dir3_get_dtype(mp,
-			xfs_mode_to_ftype(VFS_I(ip)->i_mode));
-	if (ino_dtype != dtype)
-		xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
-	xfs_irele(ip);
-out:
-	return error;
+	if (xfs_mode_to_ftype(VFS_I(ip)->i_mode) != ftype)
+		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
 }
 
 /*
  * Scrub a single directory entry.
  *
- * We use the VFS directory iterator (i.e. readdir) to call this
- * function for every directory entry in a directory.  Once we're here,
- * we check the inode number to make sure it's sane, then we check that
- * we can look up this filename.  Finally, we check the ftype.
+ * Check the inode number to make sure it's sane, then we check that we can
+ * look up this filename.  Finally, we check the ftype.
  */
-STATIC bool
+STATIC int
 xchk_dir_actor(
-	struct dir_context	*dir_iter,
-	const char		*name,
-	int			namelen,
-	loff_t			pos,
-	u64			ino,
-	unsigned		type)
+	struct xfs_scrub	*sc,
+	struct xfs_inode	*dp,
+	xfs_dir2_dataptr_t	dapos,
+	const struct xfs_name	*name,
+	xfs_ino_t		ino,
+	void			*priv)
 {
-	struct xfs_mount	*mp;
+	struct xfs_mount	*mp = dp->i_mount;
 	struct xfs_inode	*ip;
-	struct xchk_dir_ctx	*sdc;
-	struct xfs_name		xname;
 	xfs_ino_t		lookup_ino;
 	xfs_dablk_t		offset;
-	bool			checked_ftype = false;
 	int			error = 0;
 
-	sdc = container_of(dir_iter, struct xchk_dir_ctx, dir_iter);
-	ip = sdc->sc->ip;
-	mp = ip->i_mount;
 	offset = xfs_dir2_db_to_da(mp->m_dir_geo,
-			xfs_dir2_dataptr_to_db(mp->m_dir_geo, pos));
+			xfs_dir2_dataptr_to_db(mp->m_dir_geo, dapos));
 
-	if (xchk_should_terminate(sdc->sc, &error))
-		return !error;
+	if (xchk_should_terminate(sc, &error))
+		return error;
 
 	/* Does this inode number make sense? */
 	if (!xfs_verify_dir_ino(mp, ino)) {
-		xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
-		goto out;
+		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+		return -ECANCELED;
 	}
 
 	/* Does this name make sense? */
-	if (!xfs_dir2_namecheck(name, namelen)) {
-		xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
-		goto out;
+	if (!xfs_dir2_namecheck(name->name, name->len)) {
+		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+		return -ECANCELED;
 	}
 
-	if (!strncmp(".", name, namelen)) {
+	if (!strncmp(".", name->name, name->len)) {
 		/* If this is "." then check that the inum matches the dir. */
-		if (xfs_has_ftype(mp) && type != DT_DIR)
-			xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
-					offset);
-		checked_ftype = true;
-		if (ino != ip->i_ino)
-			xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
-					offset);
-	} else if (!strncmp("..", name, namelen)) {
+		if (ino != dp->i_ino)
+			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+	} else if (!strncmp("..", name->name, name->len)) {
 		/*
 		 * If this is ".." in the root inode, check that the inum
 		 * matches this dir.
 		 */
-		if (xfs_has_ftype(mp) && type != DT_DIR)
-			xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
-					offset);
-		checked_ftype = true;
-		if (ip->i_ino == mp->m_sb.sb_rootino && ino != ip->i_ino)
-			xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
-					offset);
+		if (dp->i_ino == mp->m_sb.sb_rootino && ino != dp->i_ino)
+			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
 	}
 
 	/* Verify that we can look up this name by hash. */
-	xname.name = name;
-	xname.len = namelen;
-	xname.type = XFS_DIR3_FT_UNKNOWN;
-
-	error = xfs_dir_lookup(sdc->sc->tp, ip, &xname, &lookup_ino, NULL);
+	error = xchk_dir_lookup(sc, dp, name, &lookup_ino);
 	/* ENOENT means the hash lookup failed and the dir is corrupt */
 	if (error == -ENOENT)
 		error = -EFSCORRUPTED;
-	if (!xchk_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset,
-			&error))
+	if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, offset, &error))
 		goto out;
 	if (lookup_ino != ino) {
-		xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
-		goto out;
+		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+		return -ECANCELED;
 	}
 
-	/* Verify the file type.  This function absorbs error codes. */
-	if (!checked_ftype) {
-		error = xchk_dir_check_ftype(sdc, offset, lookup_ino, type);
-		if (error)
-			goto out;
-	}
-out:
 	/*
-	 * A negative error code returned here is supposed to cause the
-	 * dir_emit caller (xfs_readdir) to abort the directory iteration
-	 * and return zero to xchk_directory.
+	 * Grab the inode pointed to by the dirent.  We release the inode
+	 * before we cancel the scrub transaction.
+	 *
+	 * If _iget returns -EINVAL or -ENOENT then the child inode number is
+	 * garbage and the directory is corrupt.  If the _iget returns
+	 * -EFSCORRUPTED or -EFSBADCRC then the child is corrupt which is a
+	 *  cross referencing error.  Any other error is an operational error.
 	 */
-	if (error == 0 && sdc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
-		return false;
-	return !error;
+	error = xchk_iget(sc, ino, &ip);
+	if (error == -EINVAL || error == -ENOENT) {
+		error = -EFSCORRUPTED;
+		xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error);
+		goto out;
+	}
+	if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, offset, &error))
+		goto out;
+
+	xchk_dir_check_ftype(sc, offset, ip, name->type);
+	xchk_irele(sc, ip);
+out:
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return -ECANCELED;
+	return error;
 }
 
 /* Scrub a directory btree record. */
@@ -201,6 +148,7 @@ xchk_dir_rec(
 	struct xchk_da_btree		*ds,
 	int				level)
 {
+	struct xfs_name			dname = { };
 	struct xfs_da_state_blk		*blk = &ds->state->path.blk[level];
 	struct xfs_mount		*mp = ds->state->mp;
 	struct xfs_inode		*dp = ds->dargs.dp;
@@ -297,7 +245,11 @@ xchk_dir_rec(
 		xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
 		goto out_relse;
 	}
-	calc_hash = xfs_da_hashname(dent->name, dent->namelen);
+
+	/* Does the directory hash match? */
+	dname.name = dent->name;
+	dname.len = dent->namelen;
+	calc_hash = xfs_dir2_hashname(mp, &dname);
 	if (calc_hash != hash)
 		xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
 
@@ -803,14 +755,7 @@ int
 xchk_directory(
 	struct xfs_scrub	*sc)
 {
-	struct xchk_dir_ctx	sdc = {
-		.dir_iter.actor = xchk_dir_actor,
-		.dir_iter.pos = 0,
-		.sc = sc,
-	};
-	size_t			bufsize;
-	loff_t			oldpos;
-	int			error = 0;
+	int			error;
 
 	if (!S_ISDIR(VFS_I(sc->ip)->i_mode))
 		return -ENOENT;
@@ -818,7 +763,7 @@ xchk_directory(
 	/* Plausible size? */
 	if (sc->ip->i_disk_size < xfs_dir2_sf_hdr_size(0)) {
 		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
-		goto out;
+		return 0;
 	}
 
 	/* Check directory tree structure */
@@ -827,7 +772,7 @@ xchk_directory(
 		return error;
 
 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
-		return error;
+		return 0;
 
 	/* Check the freespace. */
 	error = xchk_directory_blocks(sc);
@@ -835,44 +780,11 @@ xchk_directory(
 		return error;
 
 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
-		return error;
-
-	/*
-	 * Check that every dirent we see can also be looked up by hash.
-	 * Userspace usually asks for a 32k buffer, so we will too.
-	 */
-	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE,
-			sc->ip->i_disk_size);
-
-	/*
-	 * Look up every name in this directory by hash.
-	 *
-	 * Use the xfs_readdir function to call xchk_dir_actor on
-	 * every directory entry in this directory.  In _actor, we check
-	 * the name, inode number, and ftype (if applicable) of the
-	 * entry.  xfs_readdir uses the VFS filldir functions to provide
-	 * iteration context.
-	 *
-	 * The VFS grabs a read or write lock via i_rwsem before it reads
-	 * or writes to a directory.  If we've gotten this far we've
-	 * already obtained IOLOCK_EXCL, which (since 4.10) is the same as
-	 * getting a write lock on i_rwsem.  Therefore, it is safe for us
-	 * to drop the ILOCK here in order to reuse the _readdir and
-	 * _dir_lookup routines, which do their own ILOCK locking.
-	 */
-	oldpos = 0;
-	sc->ilock_flags &= ~XFS_ILOCK_EXCL;
-	xfs_iunlock(sc->ip, XFS_ILOCK_EXCL);
-	while (true) {
-		error = xfs_readdir(sc->tp, sc->ip, &sdc.dir_iter, bufsize);
-		if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0,
-				&error))
-			goto out;
-		if (oldpos == sdc.dir_iter.pos)
-			break;
-		oldpos = sdc.dir_iter.pos;
-	}
+		return 0;
 
-out:
+	/* Look up every name in this directory by hash. */
+	error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, NULL);
+	if (error == -ECANCELED)
+		error = 0;
 	return error;
 }
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index f0c7f41897b9..faa315be7978 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0+
 /*
- * Copyright (C) 2019 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2019-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_fs.h"
@@ -130,6 +130,13 @@ xchk_setup_fscounters(
 	struct xchk_fscounters	*fsc;
 	int			error;
 
+	/*
+	 * If the AGF doesn't track btreeblks, we have to lock the AGF to count
+	 * btree block usage by walking the actual btrees.
+	 */
+	if (!xfs_has_lazysbcount(sc->mp))
+		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
 	sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS);
 	if (!sc->buf)
 		return -ENOMEM;
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index aa65ec88a0c0..d2b2a1cb6533 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -1,12 +1,14 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2019 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2019-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
 #include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
 #include "xfs_btree.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h
index d0b938d3d028..66a273f8585b 100644
--- a/fs/xfs/scrub/health.h
+++ b/fs/xfs/scrub/health.h
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2019 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2019-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #ifndef __XFS_SCRUB_HEALTH_H__
 #define __XFS_SCRUB_HEALTH_H__
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index e312be7cd375..575f22a02ebe 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2017 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_fs.h"
@@ -32,6 +32,8 @@ int
 xchk_setup_ag_iallocbt(
 	struct xfs_scrub	*sc)
 {
+	if (xchk_need_intent_drain(sc))
+		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
 	return xchk_setup_ag_btree(sc, sc->flags & XCHK_TRY_HARDER);
 }
 
@@ -49,83 +51,237 @@ struct xchk_iallocbt {
 };
 
 /*
- * If we're checking the finobt, cross-reference with the inobt.
- * Otherwise we're checking the inobt; if there is an finobt, make sure
- * we have a record or not depending on freecount.
+ * Does the finobt have a record for this inode with the same hole/free state?
+ * This is a bit complicated because of the following:
+ *
+ * - The finobt need not have a record if all inodes in the inobt record are
+ *   allocated.
+ * - The finobt need not have a record if all inodes in the inobt record are
+ *   free.
+ * - The finobt need not have a record if the inobt record says this is a hole.
+ *   This likely doesn't happen in practice.
  */
-static inline void
-xchk_iallocbt_chunk_xref_other(
+STATIC int
+xchk_inobt_xref_finobt(
+	struct xfs_scrub	*sc,
+	struct xfs_inobt_rec_incore *irec,
+	xfs_agino_t		agino,
+	bool			free,
+	bool			hole)
+{
+	struct xfs_inobt_rec_incore frec;
+	struct xfs_btree_cur	*cur = sc->sa.fino_cur;
+	bool			ffree, fhole;
+	unsigned int		frec_idx, fhole_idx;
+	int			has_record;
+	int			error;
+
+	ASSERT(cur->bc_btnum == XFS_BTNUM_FINO);
+
+	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record);
+	if (error)
+		return error;
+	if (!has_record)
+		goto no_record;
+
+	error = xfs_inobt_get_rec(cur, &frec, &has_record);
+	if (!has_record)
+		return -EFSCORRUPTED;
+
+	if (frec.ir_startino + XFS_INODES_PER_CHUNK <= agino)
+		goto no_record;
+
+	/* There's a finobt record; free and hole status must match. */
+	frec_idx = agino - frec.ir_startino;
+	ffree = frec.ir_free & (1ULL << frec_idx);
+	fhole_idx = frec_idx / XFS_INODES_PER_HOLEMASK_BIT;
+	fhole = frec.ir_holemask & (1U << fhole_idx);
+
+	if (ffree != free)
+		xchk_btree_xref_set_corrupt(sc, cur, 0);
+	if (fhole != hole)
+		xchk_btree_xref_set_corrupt(sc, cur, 0);
+	return 0;
+
+no_record:
+	/* inobt record is fully allocated */
+	if (irec->ir_free == 0)
+		return 0;
+
+	/* inobt record is totally unallocated */
+	if (irec->ir_free == XFS_INOBT_ALL_FREE)
+		return 0;
+
+	/* inobt record says this is a hole */
+	if (hole)
+		return 0;
+
+	/* finobt doesn't care about allocated inodes */
+	if (!free)
+		return 0;
+
+	xchk_btree_xref_set_corrupt(sc, cur, 0);
+	return 0;
+}
+
+/*
+ * Make sure that each inode of this part of an inobt record has the same
+ * sparse and free status as the finobt.
+ */
+STATIC void
+xchk_inobt_chunk_xref_finobt(
 	struct xfs_scrub		*sc,
 	struct xfs_inobt_rec_incore	*irec,
-	xfs_agino_t			agino)
+	xfs_agino_t			agino,
+	unsigned int			nr_inodes)
 {
-	struct xfs_btree_cur		**pcur;
-	bool				has_irec;
+	xfs_agino_t			i;
+	unsigned int			rec_idx;
 	int				error;
 
-	if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT)
-		pcur = &sc->sa.ino_cur;
-	else
-		pcur = &sc->sa.fino_cur;
-	if (!(*pcur))
-		return;
-	error = xfs_ialloc_has_inode_record(*pcur, agino, agino, &has_irec);
-	if (!xchk_should_check_xref(sc, &error, pcur))
+	ASSERT(sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT);
+
+	if (!sc->sa.fino_cur || xchk_skip_xref(sc->sm))
 		return;
-	if (((irec->ir_freecount > 0 && !has_irec) ||
-	     (irec->ir_freecount == 0 && has_irec)))
-		xchk_btree_xref_set_corrupt(sc, *pcur, 0);
+
+	for (i = agino, rec_idx = agino - irec->ir_startino;
+	     i < agino + nr_inodes;
+	     i++, rec_idx++) {
+		bool			free, hole;
+		unsigned int		hole_idx;
+
+		free = irec->ir_free & (1ULL << rec_idx);
+		hole_idx = rec_idx / XFS_INODES_PER_HOLEMASK_BIT;
+		hole = irec->ir_holemask & (1U << hole_idx);
+
+		error = xchk_inobt_xref_finobt(sc, irec, i, free, hole);
+		if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur))
+			return;
+	}
+}
+
+/*
+ * Does the inobt have a record for this inode with the same hole/free state?
+ * The inobt must always have a record if there's a finobt record.
+ */
+STATIC int
+xchk_finobt_xref_inobt(
+	struct xfs_scrub	*sc,
+	struct xfs_inobt_rec_incore *frec,
+	xfs_agino_t		agino,
+	bool			ffree,
+	bool			fhole)
+{
+	struct xfs_inobt_rec_incore irec;
+	struct xfs_btree_cur	*cur = sc->sa.ino_cur;
+	bool			free, hole;
+	unsigned int		rec_idx, hole_idx;
+	int			has_record;
+	int			error;
+
+	ASSERT(cur->bc_btnum == XFS_BTNUM_INO);
+
+	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record);
+	if (error)
+		return error;
+	if (!has_record)
+		goto no_record;
+
+	error = xfs_inobt_get_rec(cur, &irec, &has_record);
+	if (!has_record)
+		return -EFSCORRUPTED;
+
+	if (irec.ir_startino + XFS_INODES_PER_CHUNK <= agino)
+		goto no_record;
+
+	/* There's an inobt record; free and hole status must match. */
+	rec_idx = agino - irec.ir_startino;
+	free = irec.ir_free & (1ULL << rec_idx);
+	hole_idx = rec_idx / XFS_INODES_PER_HOLEMASK_BIT;
+	hole = irec.ir_holemask & (1U << hole_idx);
+
+	if (ffree != free)
+		xchk_btree_xref_set_corrupt(sc, cur, 0);
+	if (fhole != hole)
+		xchk_btree_xref_set_corrupt(sc, cur, 0);
+	return 0;
+
+no_record:
+	/* finobt should never have a record for which the inobt does not */
+	xchk_btree_xref_set_corrupt(sc, cur, 0);
+	return 0;
 }
 
-/* Cross-reference with the other btrees. */
+/*
+ * Make sure that each inode of this part of an finobt record has the same
+ * sparse and free status as the inobt.
+ */
 STATIC void
-xchk_iallocbt_chunk_xref(
+xchk_finobt_chunk_xref_inobt(
 	struct xfs_scrub		*sc,
-	struct xfs_inobt_rec_incore	*irec,
+	struct xfs_inobt_rec_incore	*frec,
 	xfs_agino_t			agino,
-	xfs_agblock_t			agbno,
-	xfs_extlen_t			len)
+	unsigned int			nr_inodes)
 {
-	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+	xfs_agino_t			i;
+	unsigned int			rec_idx;
+	int				error;
+
+	ASSERT(sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT);
+
+	if (!sc->sa.ino_cur || xchk_skip_xref(sc->sm))
 		return;
 
-	xchk_xref_is_used_space(sc, agbno, len);
-	xchk_iallocbt_chunk_xref_other(sc, irec, agino);
-	xchk_xref_is_owned_by(sc, agbno, len, &XFS_RMAP_OINFO_INODES);
-	xchk_xref_is_not_shared(sc, agbno, len);
+	for (i = agino, rec_idx = agino - frec->ir_startino;
+	     i < agino + nr_inodes;
+	     i++, rec_idx++) {
+		bool			ffree, fhole;
+		unsigned int		hole_idx;
+
+		ffree = frec->ir_free & (1ULL << rec_idx);
+		hole_idx = rec_idx / XFS_INODES_PER_HOLEMASK_BIT;
+		fhole = frec->ir_holemask & (1U << hole_idx);
+
+		error = xchk_finobt_xref_inobt(sc, frec, i, ffree, fhole);
+		if (!xchk_should_check_xref(sc, &error, &sc->sa.ino_cur))
+			return;
+	}
 }
 
-/* Is this chunk worth checking? */
+/* Is this chunk worth checking and cross-referencing? */
 STATIC bool
 xchk_iallocbt_chunk(
 	struct xchk_btree		*bs,
 	struct xfs_inobt_rec_incore	*irec,
 	xfs_agino_t			agino,
-	xfs_extlen_t			len)
+	unsigned int			nr_inodes)
 {
+	struct xfs_scrub		*sc = bs->sc;
 	struct xfs_mount		*mp = bs->cur->bc_mp;
 	struct xfs_perag		*pag = bs->cur->bc_ag.pag;
-	xfs_agblock_t			bno;
+	xfs_agblock_t			agbno;
+	xfs_extlen_t			len;
 
-	bno = XFS_AGINO_TO_AGBNO(mp, agino);
+	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+	len = XFS_B_TO_FSB(mp, nr_inodes * mp->m_sb.sb_inodesize);
 
-	if (!xfs_verify_agbext(pag, bno, len))
+	if (!xfs_verify_agbext(pag, agbno, len))
 		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
 
-	xchk_iallocbt_chunk_xref(bs->sc, irec, agino, bno, len);
+	if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return false;
 
+	xchk_xref_is_used_space(sc, agbno, len);
+	if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT)
+		xchk_inobt_chunk_xref_finobt(sc, irec, agino, nr_inodes);
+	else
+		xchk_finobt_chunk_xref_inobt(sc, irec, agino, nr_inodes);
+	xchk_xref_is_only_owned_by(sc, agbno, len, &XFS_RMAP_OINFO_INODES);
+	xchk_xref_is_not_shared(sc, agbno, len);
+	xchk_xref_is_not_cow_staging(sc, agbno, len);
 	return true;
 }
 
-/* Count the number of free inodes. */
-static unsigned int
-xchk_iallocbt_freecount(
-	xfs_inofree_t			freemask)
-{
-	BUILD_BUG_ON(sizeof(freemask) != sizeof(__u64));
-	return hweight64(freemask);
-}
-
 /*
  * Check that an inode's allocation status matches ir_free in the inobt
  * record.  First we try querying the in-core inode state, and if the inode
@@ -272,7 +428,7 @@ xchk_iallocbt_check_cluster(
 		return 0;
 	}
 
-	xchk_xref_is_owned_by(bs->sc, agbno, M_IGEO(mp)->blocks_per_cluster,
+	xchk_xref_is_only_owned_by(bs->sc, agbno, M_IGEO(mp)->blocks_per_cluster,
 			&XFS_RMAP_OINFO_INODES);
 
 	/* Grab the inode cluster buffer. */
@@ -420,36 +576,22 @@ xchk_iallocbt_rec(
 	const union xfs_btree_rec	*rec)
 {
 	struct xfs_mount		*mp = bs->cur->bc_mp;
-	struct xfs_perag		*pag = bs->cur->bc_ag.pag;
 	struct xchk_iallocbt		*iabt = bs->private;
 	struct xfs_inobt_rec_incore	irec;
 	uint64_t			holes;
 	xfs_agino_t			agino;
-	xfs_extlen_t			len;
 	int				holecount;
 	int				i;
 	int				error = 0;
-	unsigned int			real_freecount;
 	uint16_t			holemask;
 
 	xfs_inobt_btrec_to_irec(mp, rec, &irec);
-
-	if (irec.ir_count > XFS_INODES_PER_CHUNK ||
-	    irec.ir_freecount > XFS_INODES_PER_CHUNK)
-		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
-
-	real_freecount = irec.ir_freecount +
-			(XFS_INODES_PER_CHUNK - irec.ir_count);
-	if (real_freecount != xchk_iallocbt_freecount(irec.ir_free))
+	if (xfs_inobt_check_irec(bs->cur, &irec) != NULL) {
 		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+		return 0;
+	}
 
 	agino = irec.ir_startino;
-	/* Record has to be properly aligned within the AG. */
-	if (!xfs_verify_agino(pag, agino) ||
-	    !xfs_verify_agino(pag, agino + XFS_INODES_PER_CHUNK - 1)) {
-		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
-		goto out;
-	}
 
 	xchk_iallocbt_rec_alignment(bs, &irec);
 	if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
@@ -459,12 +601,11 @@ xchk_iallocbt_rec(
 
 	/* Handle non-sparse inodes */
 	if (!xfs_inobt_issparse(irec.ir_holemask)) {
-		len = XFS_B_TO_FSB(mp,
-				XFS_INODES_PER_CHUNK * mp->m_sb.sb_inodesize);
 		if (irec.ir_count != XFS_INODES_PER_CHUNK)
 			xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
 
-		if (!xchk_iallocbt_chunk(bs, &irec, agino, len))
+		if (!xchk_iallocbt_chunk(bs, &irec, agino,
+					XFS_INODES_PER_CHUNK))
 			goto out;
 		goto check_clusters;
 	}
@@ -472,8 +613,6 @@ xchk_iallocbt_rec(
 	/* Check each chunk of a sparse inode cluster. */
 	holemask = irec.ir_holemask;
 	holecount = 0;
-	len = XFS_B_TO_FSB(mp,
-			XFS_INODES_PER_HOLEMASK_BIT * mp->m_sb.sb_inodesize);
 	holes = ~xfs_inobt_irec_to_allocmask(&irec);
 	if ((holes & irec.ir_free) != holes ||
 	    irec.ir_freecount > irec.ir_count)
@@ -482,8 +621,9 @@ xchk_iallocbt_rec(
 	for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; i++) {
 		if (holemask & 1)
 			holecount += XFS_INODES_PER_HOLEMASK_BIT;
-		else if (!xchk_iallocbt_chunk(bs, &irec, agino, len))
-			break;
+		else if (!xchk_iallocbt_chunk(bs, &irec, agino,
+					XFS_INODES_PER_HOLEMASK_BIT))
+			goto out;
 		holemask >>= 1;
 		agino += XFS_INODES_PER_HOLEMASK_BIT;
 	}
@@ -493,6 +633,9 @@ xchk_iallocbt_rec(
 		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
 
 check_clusters:
+	if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		goto out;
+
 	error = xchk_iallocbt_check_clusters(bs, &irec);
 	if (error)
 		goto out;
@@ -622,18 +765,18 @@ xchk_xref_inode_check(
 	xfs_agblock_t		agbno,
 	xfs_extlen_t		len,
 	struct xfs_btree_cur	**icur,
-	bool			should_have_inodes)
+	enum xbtree_recpacking	expected)
 {
-	bool			has_inodes;
+	enum xbtree_recpacking	outcome;
 	int			error;
 
 	if (!(*icur) || xchk_skip_xref(sc->sm))
 		return;
 
-	error = xfs_ialloc_has_inodes_at_extent(*icur, agbno, len, &has_inodes);
+	error = xfs_ialloc_has_inodes_at_extent(*icur, agbno, len, &outcome);
 	if (!xchk_should_check_xref(sc, &error, icur))
 		return;
-	if (has_inodes != should_have_inodes)
+	if (outcome != expected)
 		xchk_btree_xref_set_corrupt(sc, *icur, 0);
 }
 
@@ -644,8 +787,10 @@ xchk_xref_is_not_inode_chunk(
 	xfs_agblock_t		agbno,
 	xfs_extlen_t		len)
 {
-	xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, false);
-	xchk_xref_inode_check(sc, agbno, len, &sc->sa.fino_cur, false);
+	xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur,
+			XBTREE_RECPACKING_EMPTY);
+	xchk_xref_inode_check(sc, agbno, len, &sc->sa.fino_cur,
+			XBTREE_RECPACKING_EMPTY);
 }
 
 /* xref check that the extent is covered by inodes */
@@ -655,5 +800,6 @@ xchk_xref_is_inode_chunk(
 	xfs_agblock_t		agbno,
 	xfs_extlen_t		len)
 {
-	xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, true);
+	xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur,
+			XBTREE_RECPACKING_FULL);
 }
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 7a2f38e5202c..3e1e02e340a6 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2017 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_fs.h"
@@ -11,8 +11,11 @@
 #include "xfs_mount.h"
 #include "xfs_btree.h"
 #include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_ag.h"
 #include "xfs_inode.h"
 #include "xfs_ialloc.h"
+#include "xfs_icache.h"
 #include "xfs_da_format.h"
 #include "xfs_reflink.h"
 #include "xfs_rmap.h"
@@ -20,45 +23,176 @@
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/btree.h"
+#include "scrub/trace.h"
+
+/* Prepare the attached inode for scrubbing. */
+static inline int
+xchk_prepare_iscrub(
+	struct xfs_scrub	*sc)
+{
+	int			error;
+
+	sc->ilock_flags = XFS_IOLOCK_EXCL;
+	xfs_ilock(sc->ip, sc->ilock_flags);
+
+	error = xchk_trans_alloc(sc, 0);
+	if (error)
+		return error;
+
+	sc->ilock_flags |= XFS_ILOCK_EXCL;
+	xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+	return 0;
+}
+
+/* Install this scrub-by-handle inode and prepare it for scrubbing. */
+static inline int
+xchk_install_handle_iscrub(
+	struct xfs_scrub	*sc,
+	struct xfs_inode	*ip)
+{
+	int			error;
+
+	error = xchk_install_handle_inode(sc, ip);
+	if (error)
+		return error;
+
+	return xchk_prepare_iscrub(sc);
+}
 
 /*
- * Grab total control of the inode metadata.  It doesn't matter here if
- * the file data is still changing; exclusive access to the metadata is
- * the goal.
+ * Grab total control of the inode metadata.  In the best case, we grab the
+ * incore inode and take all locks on it.  If the incore inode cannot be
+ * constructed due to corruption problems, lock the AGI so that we can single
+ * step the loading process to fix everything that can go wrong.
  */
 int
 xchk_setup_inode(
 	struct xfs_scrub	*sc)
 {
+	struct xfs_imap		imap;
+	struct xfs_inode	*ip;
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_inode	*ip_in = XFS_I(file_inode(sc->file));
+	struct xfs_buf		*agi_bp;
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino);
 	int			error;
 
+	if (xchk_need_intent_drain(sc))
+		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
+	/* We want to scan the opened inode, so lock it and exit. */
+	if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) {
+		sc->ip = ip_in;
+		return xchk_prepare_iscrub(sc);
+	}
+
+	/* Reject internal metadata files and obviously bad inode numbers. */
+	if (xfs_internal_inum(mp, sc->sm->sm_ino))
+		return -ENOENT;
+	if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
+		return -ENOENT;
+
+	/* Try a regular untrusted iget. */
+	error = xchk_iget(sc, sc->sm->sm_ino, &ip);
+	if (!error)
+		return xchk_install_handle_iscrub(sc, ip);
+	if (error == -ENOENT)
+		return error;
+	if (error != -EFSCORRUPTED && error != -EFSBADCRC && error != -EINVAL)
+		goto out_error;
+
 	/*
-	 * Try to get the inode.  If the verifiers fail, we try again
-	 * in raw mode.
+	 * EINVAL with IGET_UNTRUSTED probably means one of several things:
+	 * userspace gave us an inode number that doesn't correspond to fs
+	 * space; the inode btree lacks a record for this inode; or there is
+	 * a record, and it says this inode is free.
+	 *
+	 * EFSCORRUPTED/EFSBADCRC could mean that the inode was mappable, but
+	 * some other metadata corruption (e.g. inode forks) prevented
+	 * instantiation of the incore inode.  Or it could mean the inobt is
+	 * corrupt.
+	 *
+	 * We want to look up this inode in the inobt directly to distinguish
+	 * three different scenarios: (1) the inobt says the inode is free,
+	 * in which case there's nothing to do; (2) the inobt is corrupt so we
+	 * should flag the corruption and exit to userspace to let it fix the
+	 * inobt; and (3) the inobt says the inode is allocated, but loading it
+	 * failed due to corruption.
+	 *
+	 * Allocate a transaction and grab the AGI to prevent inobt activity in
+	 * this AG.  Retry the iget in case someone allocated a new inode after
+	 * the first iget failed.
 	 */
-	error = xchk_get_inode(sc);
-	switch (error) {
-	case 0:
-		break;
-	case -EFSCORRUPTED:
-	case -EFSBADCRC:
-		return xchk_trans_alloc(sc, 0);
-	default:
-		return error;
+	error = xchk_trans_alloc(sc, 0);
+	if (error)
+		goto out_error;
+
+	error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip);
+	if (error == 0) {
+		/* Actually got the incore inode, so install it and proceed. */
+		xchk_trans_cancel(sc);
+		return xchk_install_handle_iscrub(sc, ip);
+	}
+	if (error == -ENOENT)
+		goto out_gone;
+	if (error != -EFSCORRUPTED && error != -EFSBADCRC && error != -EINVAL)
+		goto out_cancel;
+
+	/* Ensure that we have protected against inode allocation/freeing. */
+	if (agi_bp == NULL) {
+		ASSERT(agi_bp != NULL);
+		error = -ECANCELED;
+		goto out_cancel;
 	}
 
-	/* Got the inode, lock it and we're ready to go. */
-	sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
-	xfs_ilock(sc->ip, sc->ilock_flags);
-	error = xchk_trans_alloc(sc, 0);
+	/*
+	 * Untrusted iget failed a second time.  Let's try an inobt lookup.
+	 * If the inobt doesn't think this is an allocated inode then we'll
+	 * return ENOENT to signal that the check can be skipped.
+	 *
+	 * If the lookup signals corruption, we'll mark this inode corrupt and
+	 * exit to userspace.  There's little chance of fixing anything until
+	 * the inobt is straightened out, but there's nothing we can do here.
+	 *
+	 * If the lookup encounters a runtime error, exit to userspace.
+	 */
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino));
+	if (!pag) {
+		error = -EFSCORRUPTED;
+		goto out_cancel;
+	}
+
+	error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap,
+			XFS_IGET_UNTRUSTED);
+	xfs_perag_put(pag);
+	if (error == -EINVAL || error == -ENOENT)
+		goto out_gone;
 	if (error)
-		goto out;
-	sc->ilock_flags |= XFS_ILOCK_EXCL;
-	xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+		goto out_cancel;
 
-out:
-	/* scrub teardown will unlock and release the inode for us */
+	/*
+	 * The lookup succeeded.  Chances are the ondisk inode is corrupt and
+	 * preventing iget from reading it.  Retain the scrub transaction and
+	 * the AGI buffer to prevent anyone from allocating or freeing inodes.
+	 * This ensures that we preserve the inconsistency between the inobt
+	 * saying the inode is allocated and the icache being unable to load
+	 * the inode until we can flag the corruption in xchk_inode.  The
+	 * scrub function has to note the corruption, since we're not really
+	 * supposed to do that from the setup function.
+	 */
+	return 0;
+
+out_cancel:
+	xchk_trans_cancel(sc);
+out_error:
+	trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
+			error, __return_address);
 	return error;
+out_gone:
+	/* The file is gone, so there's nothing to check. */
+	xchk_trans_cancel(sc);
+	return -ENOENT;
 }
 
 /* Inode core */
@@ -553,8 +687,9 @@ xchk_inode_xref(
 
 	xchk_xref_is_used_space(sc, agbno, 1);
 	xchk_inode_xref_finobt(sc, ino);
-	xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_INODES);
+	xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_INODES);
 	xchk_xref_is_not_shared(sc, agbno, 1);
+	xchk_xref_is_not_cow_staging(sc, agbno, 1);
 	xchk_inode_xref_bmap(sc, dip);
 
 out_free:
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index d8dff3fd8053..58d5dfb7ea21 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2017 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_fs.h"
@@ -16,6 +16,7 @@
 #include "xfs_dir2_priv.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
+#include "scrub/readdir.h"
 
 /* Set us up to scrub parents. */
 int
@@ -30,122 +31,93 @@ xchk_setup_parent(
 /* Look for an entry in a parent pointing to this inode. */
 
 struct xchk_parent_ctx {
-	struct dir_context	dc;
 	struct xfs_scrub	*sc;
-	xfs_ino_t		ino;
 	xfs_nlink_t		nlink;
-	bool			cancelled;
 };
 
 /* Look for a single entry in a directory pointing to an inode. */
-STATIC bool
+STATIC int
 xchk_parent_actor(
-	struct dir_context	*dc,
-	const char		*name,
-	int			namelen,
-	loff_t			pos,
-	u64			ino,
-	unsigned		type)
+	struct xfs_scrub	*sc,
+	struct xfs_inode	*dp,
+	xfs_dir2_dataptr_t	dapos,
+	const struct xfs_name	*name,
+	xfs_ino_t		ino,
+	void			*priv)
 {
-	struct xchk_parent_ctx	*spc;
+	struct xchk_parent_ctx	*spc = priv;
 	int			error = 0;
 
-	spc = container_of(dc, struct xchk_parent_ctx, dc);
-	if (spc->ino == ino)
+	/* Does this name make sense? */
+	if (!xfs_dir2_namecheck(name->name, name->len))
+		error = -EFSCORRUPTED;
+	if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
+		return error;
+
+	if (sc->ip->i_ino == ino)
 		spc->nlink++;
 
-	/*
-	 * If we're facing a fatal signal, bail out.  Store the cancellation
-	 * status separately because the VFS readdir code squashes error codes
-	 * into short directory reads.
-	 */
 	if (xchk_should_terminate(spc->sc, &error))
-		spc->cancelled = true;
+		return error;
 
-	return !error;
+	return 0;
 }
 
-/* Count the number of dentries in the parent dir that point to this inode. */
-STATIC int
-xchk_parent_count_parent_dentries(
-	struct xfs_scrub	*sc,
-	struct xfs_inode	*parent,
-	xfs_nlink_t		*nlink)
+/*
+ * Try to lock a parent directory for checking dirents.  Returns the inode
+ * flags for the locks we now hold, or zero if we failed.
+ */
+STATIC unsigned int
+xchk_parent_ilock_dir(
+	struct xfs_inode	*dp)
 {
-	struct xchk_parent_ctx	spc = {
-		.dc.actor	= xchk_parent_actor,
-		.ino		= sc->ip->i_ino,
-		.sc		= sc,
-	};
-	size_t			bufsize;
-	loff_t			oldpos;
-	uint			lock_mode;
-	int			error = 0;
+	if (!xfs_ilock_nowait(dp, XFS_ILOCK_SHARED))
+		return 0;
 
-	/*
-	 * If there are any blocks, read-ahead block 0 as we're almost
-	 * certain to have the next operation be a read there.  This is
-	 * how we guarantee that the parent's extent map has been loaded,
-	 * if there is one.
-	 */
-	lock_mode = xfs_ilock_data_map_shared(parent);
-	if (parent->i_df.if_nextents > 0)
-		error = xfs_dir3_data_readahead(parent, 0, 0);
-	xfs_iunlock(parent, lock_mode);
-	if (error)
-		return error;
+	if (!xfs_need_iread_extents(&dp->i_df))
+		return XFS_ILOCK_SHARED;
 
-	/*
-	 * Iterate the parent dir to confirm that there is
-	 * exactly one entry pointing back to the inode being
-	 * scanned.
-	 */
-	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE,
-			parent->i_disk_size);
-	oldpos = 0;
-	while (true) {
-		error = xfs_readdir(sc->tp, parent, &spc.dc, bufsize);
-		if (error)
-			goto out;
-		if (spc.cancelled) {
-			error = -EAGAIN;
-			goto out;
-		}
-		if (oldpos == spc.dc.pos)
-			break;
-		oldpos = spc.dc.pos;
-	}
-	*nlink = spc.nlink;
-out:
-	return error;
+	xfs_iunlock(dp, XFS_ILOCK_SHARED);
+
+	if (!xfs_ilock_nowait(dp, XFS_ILOCK_EXCL))
+		return 0;
+
+	return XFS_ILOCK_EXCL;
 }
 
 /*
- * Given the inode number of the alleged parent of the inode being
- * scrubbed, try to validate that the parent has exactly one directory
- * entry pointing back to the inode being scrubbed.
+ * Given the inode number of the alleged parent of the inode being scrubbed,
+ * try to validate that the parent has exactly one directory entry pointing
+ * back to the inode being scrubbed.  Returns -EAGAIN if we need to revalidate
+ * the dotdot entry.
  */
 STATIC int
 xchk_parent_validate(
 	struct xfs_scrub	*sc,
-	xfs_ino_t		dnum,
-	bool			*try_again)
+	xfs_ino_t		parent_ino)
 {
+	struct xchk_parent_ctx	spc = {
+		.sc		= sc,
+		.nlink		= 0,
+	};
 	struct xfs_mount	*mp = sc->mp;
 	struct xfs_inode	*dp = NULL;
 	xfs_nlink_t		expected_nlink;
-	xfs_nlink_t		nlink;
+	unsigned int		lock_mode;
 	int			error = 0;
 
-	*try_again = false;
-
-	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
-		goto out;
+	/* Is this the root dir?  Then '..' must point to itself. */
+	if (sc->ip == mp->m_rootip) {
+		if (sc->ip->i_ino != mp->m_sb.sb_rootino ||
+		    sc->ip->i_ino != parent_ino)
+			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+		return 0;
+	}
 
 	/* '..' must not point to ourselves. */
-	if (sc->ip->i_ino == dnum) {
+	if (sc->ip->i_ino == parent_ino) {
 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
-		goto out;
+		return 0;
 	}
 
 	/*
@@ -155,106 +127,51 @@ xchk_parent_validate(
 	expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1;
 
 	/*
-	 * Grab this parent inode.  We release the inode before we
-	 * cancel the scrub transaction.  Since we're don't know a
-	 * priori that releasing the inode won't trigger eofblocks
-	 * cleanup (which allocates what would be a nested transaction)
-	 * if the parent pointer erroneously points to a file, we
-	 * can't use DONTCACHE here because DONTCACHE inodes can trigger
-	 * immediate inactive cleanup of the inode.
+	 * Grab the parent directory inode.  This must be released before we
+	 * cancel the scrub transaction.
 	 *
 	 * If _iget returns -EINVAL or -ENOENT then the parent inode number is
 	 * garbage and the directory is corrupt.  If the _iget returns
 	 * -EFSCORRUPTED or -EFSBADCRC then the parent is corrupt which is a
 	 *  cross referencing error.  Any other error is an operational error.
 	 */
-	error = xfs_iget(mp, sc->tp, dnum, XFS_IGET_UNTRUSTED, 0, &dp);
+	error = xchk_iget(sc, parent_ino, &dp);
 	if (error == -EINVAL || error == -ENOENT) {
 		error = -EFSCORRUPTED;
 		xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error);
-		goto out;
+		return error;
 	}
 	if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
-		goto out;
+		return error;
 	if (dp == sc->ip || !S_ISDIR(VFS_I(dp)->i_mode)) {
 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
 		goto out_rele;
 	}
 
-	/*
-	 * We prefer to keep the inode locked while we lock and search
-	 * its alleged parent for a forward reference.  If we can grab
-	 * the iolock, validate the pointers and we're done.  We must
-	 * use nowait here to avoid an ABBA deadlock on the parent and
-	 * the child inodes.
-	 */
-	if (xfs_ilock_nowait(dp, XFS_IOLOCK_SHARED)) {
-		error = xchk_parent_count_parent_dentries(sc, dp, &nlink);
-		if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0,
-				&error))
-			goto out_unlock;
-		if (nlink != expected_nlink)
-			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
-		goto out_unlock;
-	}
-
-	/*
-	 * The game changes if we get here.  We failed to lock the parent,
-	 * so we're going to try to verify both pointers while only holding
-	 * one lock so as to avoid deadlocking with something that's actually
-	 * trying to traverse down the directory tree.
-	 */
-	xfs_iunlock(sc->ip, sc->ilock_flags);
-	sc->ilock_flags = 0;
-	error = xchk_ilock_inverted(dp, XFS_IOLOCK_SHARED);
-	if (error)
+	lock_mode = xchk_parent_ilock_dir(dp);
+	if (!lock_mode) {
+		xfs_iunlock(sc->ip, XFS_ILOCK_EXCL);
+		xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+		error = -EAGAIN;
 		goto out_rele;
+	}
 
-	/* Go looking for our dentry. */
-	error = xchk_parent_count_parent_dentries(sc, dp, &nlink);
+	/* Look for a directory entry in the parent pointing to the child. */
+	error = xchk_dir_walk(sc, dp, xchk_parent_actor, &spc);
 	if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
 		goto out_unlock;
 
-	/* Drop the parent lock, relock this inode. */
-	xfs_iunlock(dp, XFS_IOLOCK_SHARED);
-	error = xchk_ilock_inverted(sc->ip, XFS_IOLOCK_EXCL);
-	if (error)
-		goto out_rele;
-	sc->ilock_flags = XFS_IOLOCK_EXCL;
-
-	/*
-	 * If we're an unlinked directory, the parent /won't/ have a link
-	 * to us.  Otherwise, it should have one link.  We have to re-set
-	 * it here because we dropped the lock on sc->ip.
-	 */
-	expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1;
-
-	/* Look up '..' to see if the inode changed. */
-	error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &dnum, NULL);
-	if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
-		goto out_rele;
-
-	/* Drat, parent changed.  Try again! */
-	if (dnum != dp->i_ino) {
-		xfs_irele(dp);
-		*try_again = true;
-		return 0;
-	}
-	xfs_irele(dp);
-
 	/*
-	 * '..' didn't change, so check that there was only one entry
-	 * for us in the parent.
+	 * Ensure that the parent has as many links to the child as the child
+	 * thinks it has to the parent.
 	 */
-	if (nlink != expected_nlink)
+	if (spc.nlink != expected_nlink)
 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
-	return error;
 
 out_unlock:
-	xfs_iunlock(dp, XFS_IOLOCK_SHARED);
+	xfs_iunlock(dp, lock_mode);
 out_rele:
-	xfs_irele(dp);
-out:
+	xchk_irele(sc, dp);
 	return error;
 }
 
@@ -264,9 +181,7 @@ xchk_parent(
 	struct xfs_scrub	*sc)
 {
 	struct xfs_mount	*mp = sc->mp;
-	xfs_ino_t		dnum;
-	bool			try_again;
-	int			tries = 0;
+	xfs_ino_t		parent_ino;
 	int			error = 0;
 
 	/*
@@ -279,56 +194,29 @@ xchk_parent(
 	/* We're not a special inode, are we? */
 	if (!xfs_verify_dir_ino(mp, sc->ip->i_ino)) {
 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
-		goto out;
+		return 0;
 	}
 
-	/*
-	 * The VFS grabs a read or write lock via i_rwsem before it reads
-	 * or writes to a directory.  If we've gotten this far we've
-	 * already obtained IOLOCK_EXCL, which (since 4.10) is the same as
-	 * getting a write lock on i_rwsem.  Therefore, it is safe for us
-	 * to drop the ILOCK here in order to do directory lookups.
-	 */
-	sc->ilock_flags &= ~(XFS_ILOCK_EXCL | XFS_MMAPLOCK_EXCL);
-	xfs_iunlock(sc->ip, XFS_ILOCK_EXCL | XFS_MMAPLOCK_EXCL);
-
-	/* Look up '..' */
-	error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &dnum, NULL);
-	if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
-		goto out;
-	if (!xfs_verify_dir_ino(mp, dnum)) {
-		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
-		goto out;
-	}
+	do {
+		if (xchk_should_terminate(sc, &error))
+			break;
 
-	/* Is this the root dir?  Then '..' must point to itself. */
-	if (sc->ip == mp->m_rootip) {
-		if (sc->ip->i_ino != mp->m_sb.sb_rootino ||
-		    sc->ip->i_ino != dnum)
+		/* Look up '..' */
+		error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot,
+				&parent_ino);
+		if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
+			return error;
+		if (!xfs_verify_dir_ino(mp, parent_ino)) {
 			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
-		goto out;
-	}
+			return 0;
+		}
 
-	do {
-		error = xchk_parent_validate(sc, dnum, &try_again);
-		if (error)
-			goto out;
-	} while (try_again && ++tries < 20);
+		/*
+		 * Check that the dotdot entry points to a parent directory
+		 * containing a dirent pointing to this subdirectory.
+		 */
+		error = xchk_parent_validate(sc, parent_ino);
+	} while (error == -EAGAIN);
 
-	/*
-	 * We gave it our best shot but failed, so mark this scrub
-	 * incomplete.  Userspace can decide if it wants to try again.
-	 */
-	if (try_again && tries == 20)
-		xchk_set_incomplete(sc);
-out:
-	/*
-	 * If we failed to lock the parent inode even after a retry, just mark
-	 * this scrub incomplete and return.
-	 */
-	if ((sc->flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) {
-		error = 0;
-		xchk_set_incomplete(sc);
-	}
 	return error;
 }
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 9eeac8565394..e6caa358cbda 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2017 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_fs.h"
@@ -53,6 +53,9 @@ xchk_setup_quota(
 	if (!xfs_this_quota_on(sc->mp, dqtype))
 		return -ENOENT;
 
+	if (xchk_need_intent_drain(sc))
+		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
 	error = xchk_setup_fs(sc);
 	if (error)
 		return error;
diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c
new file mode 100644
index 000000000000..e51c1544be63
--- /dev/null
+++ b/fs/xfs/scrub/readdir.c
@@ -0,0 +1,375 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_trace.h"
+#include "xfs_bmap.h"
+#include "xfs_trans.h"
+#include "xfs_error.h"
+#include "scrub/scrub.h"
+#include "scrub/readdir.h"
+
+/* Call a function for every entry in a shortform directory. */
+STATIC int
+xchk_dir_walk_sf(
+	struct xfs_scrub	*sc,
+	struct xfs_inode	*dp,
+	xchk_dirent_fn		dirent_fn,
+	void			*priv)
+{
+	struct xfs_name		name = {
+		.name		= ".",
+		.len		= 1,
+		.type		= XFS_DIR3_FT_DIR,
+	};
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_da_geometry	*geo = mp->m_dir_geo;
+	struct xfs_dir2_sf_entry *sfep;
+	struct xfs_dir2_sf_hdr	*sfp;
+	xfs_ino_t		ino;
+	xfs_dir2_dataptr_t	dapos;
+	unsigned int		i;
+	int			error;
+
+	ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+
+	sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data;
+
+	/* dot entry */
+	dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+			geo->data_entry_offset);
+
+	error = dirent_fn(sc, dp, dapos, &name, dp->i_ino, priv);
+	if (error)
+		return error;
+
+	/* dotdot entry */
+	dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+			geo->data_entry_offset +
+			xfs_dir2_data_entsize(mp, sizeof(".") - 1));
+	ino = xfs_dir2_sf_get_parent_ino(sfp);
+	name.name = "..";
+	name.len = 2;
+
+	error = dirent_fn(sc, dp, dapos, &name, ino, priv);
+	if (error)
+		return error;
+
+	/* iterate everything else */
+	sfep = xfs_dir2_sf_firstentry(sfp);
+	for (i = 0; i < sfp->count; i++) {
+		dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+				xfs_dir2_sf_get_offset(sfep));
+		ino = xfs_dir2_sf_get_ino(mp, sfp, sfep);
+		name.name = sfep->name;
+		name.len = sfep->namelen;
+		name.type = xfs_dir2_sf_get_ftype(mp, sfep);
+
+		error = dirent_fn(sc, dp, dapos, &name, ino, priv);
+		if (error)
+			return error;
+
+		sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
+	}
+
+	return 0;
+}
+
+/* Call a function for every entry in a block directory. */
+STATIC int
+xchk_dir_walk_block(
+	struct xfs_scrub	*sc,
+	struct xfs_inode	*dp,
+	xchk_dirent_fn		dirent_fn,
+	void			*priv)
+{
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_da_geometry	*geo = mp->m_dir_geo;
+	struct xfs_buf		*bp;
+	unsigned int		off, next_off, end;
+	int			error;
+
+	error = xfs_dir3_block_read(sc->tp, dp, &bp);
+	if (error)
+		return error;
+
+	/* Walk each directory entry. */
+	end = xfs_dir3_data_end_offset(geo, bp->b_addr);
+	for (off = geo->data_entry_offset; off < end; off = next_off) {
+		struct xfs_name			name = { };
+		struct xfs_dir2_data_unused	*dup = bp->b_addr + off;
+		struct xfs_dir2_data_entry	*dep = bp->b_addr + off;
+		xfs_ino_t			ino;
+		xfs_dir2_dataptr_t		dapos;
+
+		/* Skip an empty entry. */
+		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+			next_off = off + be16_to_cpu(dup->length);
+			continue;
+		}
+
+		/* Otherwise, find the next entry and report it. */
+		next_off = off + xfs_dir2_data_entsize(mp, dep->namelen);
+		if (next_off > end)
+			break;
+
+		dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, off);
+		ino = be64_to_cpu(dep->inumber);
+		name.name = dep->name;
+		name.len = dep->namelen;
+		name.type = xfs_dir2_data_get_ftype(mp, dep);
+
+		error = dirent_fn(sc, dp, dapos, &name, ino, priv);
+		if (error)
+			break;
+	}
+
+	xfs_trans_brelse(sc->tp, bp);
+	return error;
+}
+
+/* Read a leaf-format directory buffer. */
+STATIC int
+xchk_read_leaf_dir_buf(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	struct xfs_da_geometry	*geo,
+	xfs_dir2_off_t		*curoff,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_iext_cursor	icur;
+	struct xfs_bmbt_irec	map;
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK);
+	xfs_dablk_t		last_da;
+	xfs_dablk_t		map_off;
+	xfs_dir2_off_t		new_off;
+
+	*bpp = NULL;
+
+	/*
+	 * Look for mapped directory blocks at or above the current offset.
+	 * Truncate down to the nearest directory block to start the scanning
+	 * operation.
+	 */
+	last_da = xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET);
+	map_off = xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, *curoff));
+
+	if (!xfs_iext_lookup_extent(dp, ifp, map_off, &icur, &map))
+		return 0;
+	if (map.br_startoff >= last_da)
+		return 0;
+	xfs_trim_extent(&map, map_off, last_da - map_off);
+
+	/* Read the directory block of that first mapping. */
+	new_off = xfs_dir2_da_to_byte(geo, map.br_startoff);
+	if (new_off > *curoff)
+		*curoff = new_off;
+
+	return xfs_dir3_data_read(tp, dp, map.br_startoff, 0, bpp);
+}
+
+/* Call a function for every entry in a leaf directory. */
+STATIC int
+xchk_dir_walk_leaf(
+	struct xfs_scrub	*sc,
+	struct xfs_inode	*dp,
+	xchk_dirent_fn		dirent_fn,
+	void			*priv)
+{
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_da_geometry	*geo = mp->m_dir_geo;
+	struct xfs_buf		*bp = NULL;
+	xfs_dir2_off_t		curoff = 0;
+	unsigned int		offset = 0;
+	int			error;
+
+	/* Iterate every directory offset in this directory. */
+	while (curoff < XFS_DIR2_LEAF_OFFSET) {
+		struct xfs_name			name = { };
+		struct xfs_dir2_data_unused	*dup;
+		struct xfs_dir2_data_entry	*dep;
+		xfs_ino_t			ino;
+		unsigned int			length;
+		xfs_dir2_dataptr_t		dapos;
+
+		/*
+		 * If we have no buffer, or we're off the end of the
+		 * current buffer, need to get another one.
+		 */
+		if (!bp || offset >= geo->blksize) {
+			if (bp) {
+				xfs_trans_brelse(sc->tp, bp);
+				bp = NULL;
+			}
+
+			error = xchk_read_leaf_dir_buf(sc->tp, dp, geo, &curoff,
+					&bp);
+			if (error || !bp)
+				break;
+
+			/*
+			 * Find our position in the block.
+			 */
+			offset = geo->data_entry_offset;
+			curoff += geo->data_entry_offset;
+		}
+
+		/* Skip an empty entry. */
+		dup = bp->b_addr + offset;
+		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+			length = be16_to_cpu(dup->length);
+			offset += length;
+			curoff += length;
+			continue;
+		}
+
+		/* Otherwise, find the next entry and report it. */
+		dep = bp->b_addr + offset;
+		length = xfs_dir2_data_entsize(mp, dep->namelen);
+
+		dapos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
+		ino = be64_to_cpu(dep->inumber);
+		name.name = dep->name;
+		name.len = dep->namelen;
+		name.type = xfs_dir2_data_get_ftype(mp, dep);
+
+		error = dirent_fn(sc, dp, dapos, &name, ino, priv);
+		if (error)
+			break;
+
+		/* Advance to the next entry. */
+		offset += length;
+		curoff += length;
+	}
+
+	if (bp)
+		xfs_trans_brelse(sc->tp, bp);
+	return error;
+}
+
+/*
+ * Call a function for every entry in a directory.
+ *
+ * Callers must hold the ILOCK.  File types are XFS_DIR3_FT_*.
+ */
+int
+xchk_dir_walk(
+	struct xfs_scrub	*sc,
+	struct xfs_inode	*dp,
+	xchk_dirent_fn		dirent_fn,
+	void			*priv)
+{
+	struct xfs_da_args	args = {
+		.dp		= dp,
+		.geo		= dp->i_mount->m_dir_geo,
+		.trans		= sc->tp,
+	};
+	bool			isblock;
+	int			error;
+
+	if (xfs_is_shutdown(dp->i_mount))
+		return -EIO;
+
+	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+	ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+
+	if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL)
+		return xchk_dir_walk_sf(sc, dp, dirent_fn, priv);
+
+	/* dir2 functions require that the data fork is loaded */
+	error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK);
+	if (error)
+		return error;
+
+	error = xfs_dir2_isblock(&args, &isblock);
+	if (error)
+		return error;
+
+	if (isblock)
+		return xchk_dir_walk_block(sc, dp, dirent_fn, priv);
+
+	return xchk_dir_walk_leaf(sc, dp, dirent_fn, priv);
+}
+
+/*
+ * Look up the inode number for an exact name in a directory.
+ *
+ * Callers must hold the ILOCK.  File types are XFS_DIR3_FT_*.  Names are not
+ * checked for correctness.
+ */
+int
+xchk_dir_lookup(
+	struct xfs_scrub	*sc,
+	struct xfs_inode	*dp,
+	const struct xfs_name	*name,
+	xfs_ino_t		*ino)
+{
+	struct xfs_da_args	args = {
+		.dp		= dp,
+		.geo		= dp->i_mount->m_dir_geo,
+		.trans		= sc->tp,
+		.name		= name->name,
+		.namelen	= name->len,
+		.filetype	= name->type,
+		.hashval	= xfs_dir2_hashname(dp->i_mount, name),
+		.whichfork	= XFS_DATA_FORK,
+		.op_flags	= XFS_DA_OP_OKNOENT,
+	};
+	bool			isblock, isleaf;
+	int			error;
+
+	if (xfs_is_shutdown(dp->i_mount))
+		return -EIO;
+
+	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+	ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+
+	if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+		error = xfs_dir2_sf_lookup(&args);
+		goto out_check_rval;
+	}
+
+	/* dir2 functions require that the data fork is loaded */
+	error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK);
+	if (error)
+		return error;
+
+	error = xfs_dir2_isblock(&args, &isblock);
+	if (error)
+		return error;
+
+	if (isblock) {
+		error = xfs_dir2_block_lookup(&args);
+		goto out_check_rval;
+	}
+
+	error = xfs_dir2_isleaf(&args, &isleaf);
+	if (error)
+		return error;
+
+	if (isleaf) {
+		error = xfs_dir2_leaf_lookup(&args);
+		goto out_check_rval;
+	}
+
+	error = xfs_dir2_node_lookup(&args);
+
+out_check_rval:
+	if (error == -EEXIST)
+		error = 0;
+	if (!error)
+		*ino = args.inumber;
+	return error;
+}
diff --git a/fs/xfs/scrub/readdir.h b/fs/xfs/scrub/readdir.h
new file mode 100644
index 000000000000..55787f4df123
--- /dev/null
+++ b/fs/xfs/scrub/readdir.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_READDIR_H__
+#define __XFS_SCRUB_READDIR_H__
+
+typedef int (*xchk_dirent_fn)(struct xfs_scrub *sc, struct xfs_inode *dp,
+		xfs_dir2_dataptr_t dapos, const struct xfs_name *name,
+		xfs_ino_t ino, void *priv);
+
+int xchk_dir_walk(struct xfs_scrub *sc, struct xfs_inode *dp,
+		xchk_dirent_fn dirent_fn, void *priv);
+
+int xchk_dir_lookup(struct xfs_scrub *sc, struct xfs_inode *dp,
+		const struct xfs_name *name, xfs_ino_t *ino);
+
+#endif /* __XFS_SCRUB_READDIR_H__ */
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index d9c1b3cea4a5..304ea1e1bfb0 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -1,21 +1,22 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2017 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
 #include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_ag.h"
 #include "xfs_btree.h"
 #include "xfs_rmap.h"
 #include "xfs_refcount.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/btree.h"
-#include "xfs_trans_resv.h"
-#include "xfs_mount.h"
-#include "xfs_ag.h"
+#include "scrub/trace.h"
 
 /*
  * Set us up to scrub reference count btrees.
@@ -24,6 +25,8 @@ int
 xchk_setup_ag_refcountbt(
 	struct xfs_scrub	*sc)
 {
+	if (xchk_need_intent_drain(sc))
+		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
 	return xchk_setup_ag_btree(sc, false);
 }
 
@@ -300,8 +303,10 @@ xchk_refcountbt_xref_rmap(
 		goto out_free;
 
 	xchk_refcountbt_process_rmap_fragments(&refchk);
-	if (irec->rc_refcount != refchk.seen)
+	if (irec->rc_refcount != refchk.seen) {
+		trace_xchk_refcount_incorrect(sc->sa.pag, irec, refchk.seen);
 		xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+	}
 
 out_free:
 	list_for_each_entry_safe(frag, n, &refchk.fragments, list) {
@@ -325,6 +330,107 @@ xchk_refcountbt_xref(
 	xchk_refcountbt_xref_rmap(sc, irec);
 }
 
+struct xchk_refcbt_records {
+	/* Previous refcount record. */
+	struct xfs_refcount_irec prev_rec;
+
+	/* The next AG block where we aren't expecting shared extents. */
+	xfs_agblock_t		next_unshared_agbno;
+
+	/* Number of CoW blocks we expect. */
+	xfs_agblock_t		cow_blocks;
+
+	/* Was the last record a shared or CoW staging extent? */
+	enum xfs_refc_domain	prev_domain;
+};
+
+STATIC int
+xchk_refcountbt_rmap_check_gap(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*priv)
+{
+	xfs_agblock_t			*next_bno = priv;
+
+	if (*next_bno != NULLAGBLOCK && rec->rm_startblock < *next_bno)
+		return -ECANCELED;
+
+	*next_bno = rec->rm_startblock + rec->rm_blockcount;
+	return 0;
+}
+
+/*
+ * Make sure that a gap in the reference count records does not correspond to
+ * overlapping records (i.e. shared extents) in the reverse mappings.
+ */
+static inline void
+xchk_refcountbt_xref_gaps(
+	struct xfs_scrub	*sc,
+	struct xchk_refcbt_records *rrc,
+	xfs_agblock_t		bno)
+{
+	struct xfs_rmap_irec	low;
+	struct xfs_rmap_irec	high;
+	xfs_agblock_t		next_bno = NULLAGBLOCK;
+	int			error;
+
+	if (bno <= rrc->next_unshared_agbno || !sc->sa.rmap_cur ||
+            xchk_skip_xref(sc->sm))
+		return;
+
+	memset(&low, 0, sizeof(low));
+	low.rm_startblock = rrc->next_unshared_agbno;
+	memset(&high, 0xFF, sizeof(high));
+	high.rm_startblock = bno - 1;
+
+	error = xfs_rmap_query_range(sc->sa.rmap_cur, &low, &high,
+			xchk_refcountbt_rmap_check_gap, &next_bno);
+	if (error == -ECANCELED)
+		xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+	else
+		xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur);
+}
+
+static inline bool
+xchk_refcount_mergeable(
+	struct xchk_refcbt_records	*rrc,
+	const struct xfs_refcount_irec	*r2)
+{
+	const struct xfs_refcount_irec	*r1 = &rrc->prev_rec;
+
+	/* Ignore if prev_rec is not yet initialized. */
+	if (r1->rc_blockcount > 0)
+		return false;
+
+	if (r1->rc_domain != r2->rc_domain)
+		return false;
+	if (r1->rc_startblock + r1->rc_blockcount != r2->rc_startblock)
+		return false;
+	if (r1->rc_refcount != r2->rc_refcount)
+		return false;
+	if ((unsigned long long)r1->rc_blockcount + r2->rc_blockcount >
+			MAXREFCEXTLEN)
+		return false;
+
+	return true;
+}
+
+/* Flag failures for records that could be merged. */
+STATIC void
+xchk_refcountbt_check_mergeable(
+	struct xchk_btree		*bs,
+	struct xchk_refcbt_records	*rrc,
+	const struct xfs_refcount_irec	*irec)
+{
+	if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return;
+
+	if (xchk_refcount_mergeable(rrc, irec))
+		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+	memcpy(&rrc->prev_rec, irec, sizeof(struct xfs_refcount_irec));
+}
+
 /* Scrub a refcountbt record. */
 STATIC int
 xchk_refcountbt_rec(
@@ -332,27 +438,37 @@ xchk_refcountbt_rec(
 	const union xfs_btree_rec *rec)
 {
 	struct xfs_refcount_irec irec;
-	xfs_agblock_t		*cow_blocks = bs->private;
-	struct xfs_perag	*pag = bs->cur->bc_ag.pag;
+	struct xchk_refcbt_records *rrc = bs->private;
 
 	xfs_refcount_btrec_to_irec(rec, &irec);
-
-	/* Check the domain and refcount are not incompatible. */
-	if (!xfs_refcount_check_domain(&irec))
+	if (xfs_refcount_check_irec(bs->cur, &irec) != NULL) {
 		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+		return 0;
+	}
 
 	if (irec.rc_domain == XFS_REFC_DOMAIN_COW)
-		(*cow_blocks) += irec.rc_blockcount;
-
-	/* Check the extent. */
-	if (!xfs_verify_agbext(pag, irec.rc_startblock, irec.rc_blockcount))
-		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+		rrc->cow_blocks += irec.rc_blockcount;
 
-	if (irec.rc_refcount == 0)
+	/* Shared records always come before CoW records. */
+	if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED &&
+	    rrc->prev_domain == XFS_REFC_DOMAIN_COW)
 		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+	rrc->prev_domain = irec.rc_domain;
 
+	xchk_refcountbt_check_mergeable(bs, rrc, &irec);
 	xchk_refcountbt_xref(bs->sc, &irec);
 
+	/*
+	 * If this is a record for a shared extent, check that all blocks
+	 * between the previous record and this one have at most one reverse
+	 * mapping.
+	 */
+	if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED) {
+		xchk_refcountbt_xref_gaps(bs->sc, rrc, irec.rc_startblock);
+		rrc->next_unshared_agbno = irec.rc_startblock +
+					   irec.rc_blockcount;
+	}
+
 	return 0;
 }
 
@@ -394,15 +510,25 @@ int
 xchk_refcountbt(
 	struct xfs_scrub	*sc)
 {
-	xfs_agblock_t		cow_blocks = 0;
+	struct xchk_refcbt_records rrc = {
+		.cow_blocks		= 0,
+		.next_unshared_agbno	= 0,
+		.prev_domain		= XFS_REFC_DOMAIN_SHARED,
+	};
 	int			error;
 
 	error = xchk_btree(sc, sc->sa.refc_cur, xchk_refcountbt_rec,
-			&XFS_RMAP_OINFO_REFC, &cow_blocks);
+			&XFS_RMAP_OINFO_REFC, &rrc);
 	if (error)
 		return error;
 
-	xchk_refcount_xref_rmap(sc, cow_blocks);
+	/*
+	 * Check that all blocks between the last refcount > 1 record and the
+	 * end of the AG have at most one reverse mapping.
+	 */
+	xchk_refcountbt_xref_gaps(sc, &rrc, sc->mp->m_sb.sb_agblocks);
+
+	xchk_refcount_xref_rmap(sc, rrc.cow_blocks);
 
 	return 0;
 }
@@ -458,16 +584,37 @@ xchk_xref_is_not_shared(
 	xfs_agblock_t		agbno,
 	xfs_extlen_t		len)
 {
-	bool			shared;
+	enum xbtree_recpacking	outcome;
+	int			error;
+
+	if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm))
+		return;
+
+	error = xfs_refcount_has_records(sc->sa.refc_cur,
+			XFS_REFC_DOMAIN_SHARED, agbno, len, &outcome);
+	if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur))
+		return;
+	if (outcome != XBTREE_RECPACKING_EMPTY)
+		xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
+}
+
+/* xref check that the extent is not being used for CoW staging. */
+void
+xchk_xref_is_not_cow_staging(
+	struct xfs_scrub	*sc,
+	xfs_agblock_t		agbno,
+	xfs_extlen_t		len)
+{
+	enum xbtree_recpacking	outcome;
 	int			error;
 
 	if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm))
 		return;
 
-	error = xfs_refcount_has_record(sc->sa.refc_cur, XFS_REFC_DOMAIN_SHARED,
-			agbno, len, &shared);
+	error = xfs_refcount_has_records(sc->sa.refc_cur, XFS_REFC_DOMAIN_COW,
+			agbno, len, &outcome);
 	if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur))
 		return;
-	if (shared)
+	if (outcome != XBTREE_RECPACKING_EMPTY)
 		xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
 }
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 1b71174ec0d6..ac6d8803e660 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2018 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_fs.h"
@@ -60,6 +60,9 @@ xrep_attempt(
 		sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
 		sc->flags |= XREP_ALREADY_FIXED;
 		return -EAGAIN;
+	case -ECHRNG:
+		sc->flags |= XCHK_NEED_DRAIN;
+		return -EAGAIN;
 	case -EDEADLOCK:
 		/* Tell the caller to try again having grabbed all the locks. */
 		if (!(sc->flags & XCHK_TRY_HARDER)) {
@@ -442,6 +445,30 @@ xrep_init_btblock(
  * buffers associated with @bitmap.
  */
 
+static int
+xrep_invalidate_block(
+	uint64_t		fsbno,
+	void			*priv)
+{
+	struct xfs_scrub	*sc = priv;
+	struct xfs_buf		*bp;
+	int			error;
+
+	/* Skip AG headers and post-EOFS blocks */
+	if (!xfs_verify_fsbno(sc->mp, fsbno))
+		return 0;
+
+	error = xfs_buf_incore(sc->mp->m_ddev_targp,
+			XFS_FSB_TO_DADDR(sc->mp, fsbno),
+			XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp);
+	if (error)
+		return 0;
+
+	xfs_trans_bjoin(sc->tp, bp);
+	xfs_trans_binval(sc->tp, bp);
+	return 0;
+}
+
 /*
  * Invalidate buffers for per-AG btree blocks we're dumping.  This function
  * is not intended for use with file data repairs; we have bunmapi for that.
@@ -451,11 +478,6 @@ xrep_invalidate_blocks(
 	struct xfs_scrub	*sc,
 	struct xbitmap		*bitmap)
 {
-	struct xbitmap_range	*bmr;
-	struct xbitmap_range	*n;
-	struct xfs_buf		*bp;
-	xfs_fsblock_t		fsbno;
-
 	/*
 	 * For each block in each extent, see if there's an incore buffer for
 	 * exactly that block; if so, invalidate it.  The buffer cache only
@@ -464,23 +486,7 @@ xrep_invalidate_blocks(
 	 * because we never own those; and if we can't TRYLOCK the buffer we
 	 * assume it's owned by someone else.
 	 */
-	for_each_xbitmap_block(fsbno, bmr, n, bitmap) {
-		int		error;
-
-		/* Skip AG headers and post-EOFS blocks */
-		if (!xfs_verify_fsbno(sc->mp, fsbno))
-			continue;
-		error = xfs_buf_incore(sc->mp->m_ddev_targp,
-				XFS_FSB_TO_DADDR(sc->mp, fsbno),
-				XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp);
-		if (error)
-			continue;
-
-		xfs_trans_bjoin(sc->tp, bp);
-		xfs_trans_binval(sc->tp, bp);
-	}
-
-	return 0;
+	return xbitmap_walk_bits(bitmap, xrep_invalidate_block, sc);
 }
 
 /* Ensure the freelist is the correct size. */
@@ -501,6 +507,15 @@ xrep_fix_freelist(
 			can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
 }
 
+/* Information about reaping extents after a repair. */
+struct xrep_reap_state {
+	struct xfs_scrub		*sc;
+
+	/* Reverse mapping owner and metadata reservation type. */
+	const struct xfs_owner_info	*oinfo;
+	enum xfs_ag_resv_type		resv;
+};
+
 /*
  * Put a block back on the AGFL.
  */
@@ -545,17 +560,23 @@ xrep_put_freelist(
 /* Dispose of a single block. */
 STATIC int
 xrep_reap_block(
-	struct xfs_scrub		*sc,
-	xfs_fsblock_t			fsbno,
-	const struct xfs_owner_info	*oinfo,
-	enum xfs_ag_resv_type		resv)
+	uint64_t			fsbno,
+	void				*priv)
 {
+	struct xrep_reap_state		*rs = priv;
+	struct xfs_scrub		*sc = rs->sc;
 	struct xfs_btree_cur		*cur;
 	struct xfs_buf			*agf_bp = NULL;
 	xfs_agblock_t			agbno;
 	bool				has_other_rmap;
 	int				error;
 
+	ASSERT(sc->ip != NULL ||
+	       XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);
+	trace_xrep_dispose_btree_extent(sc->mp,
+			XFS_FSB_TO_AGNO(sc->mp, fsbno),
+			XFS_FSB_TO_AGBNO(sc->mp, fsbno), 1);
+
 	agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
 	ASSERT(XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);
 
@@ -574,7 +595,8 @@ xrep_reap_block(
 	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, sc->sa.pag);
 
 	/* Can we find any other rmappings? */
-	error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap);
+	error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo,
+			&has_other_rmap);
 	xfs_btree_del_cursor(cur, error);
 	if (error)
 		goto out_free;
@@ -594,11 +616,12 @@ xrep_reap_block(
 	 */
 	if (has_other_rmap)
 		error = xfs_rmap_free(sc->tp, agf_bp, sc->sa.pag, agbno,
-					1, oinfo);
-	else if (resv == XFS_AG_RESV_AGFL)
+					1, rs->oinfo);
+	else if (rs->resv == XFS_AG_RESV_AGFL)
 		error = xrep_put_freelist(sc, agbno);
 	else
-		error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv);
+		error = xfs_free_extent(sc->tp, sc->sa.pag, agbno, 1, rs->oinfo,
+				rs->resv);
 	if (agf_bp != sc->sa.agf_bp)
 		xfs_trans_brelse(sc->tp, agf_bp);
 	if (error)
@@ -622,26 +645,15 @@ xrep_reap_extents(
 	const struct xfs_owner_info	*oinfo,
 	enum xfs_ag_resv_type		type)
 {
-	struct xbitmap_range		*bmr;
-	struct xbitmap_range		*n;
-	xfs_fsblock_t			fsbno;
-	int				error = 0;
+	struct xrep_reap_state		rs = {
+		.sc			= sc,
+		.oinfo			= oinfo,
+		.resv			= type,
+	};
 
 	ASSERT(xfs_has_rmapbt(sc->mp));
 
-	for_each_xbitmap_block(fsbno, bmr, n, bitmap) {
-		ASSERT(sc->ip != NULL ||
-		       XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);
-		trace_xrep_dispose_btree_extent(sc->mp,
-				XFS_FSB_TO_AGNO(sc->mp, fsbno),
-				XFS_FSB_TO_AGBNO(sc->mp, fsbno), 1);
-
-		error = xrep_reap_block(sc, fsbno, oinfo, type);
-		if (error)
-			break;
-	}
-
-	return error;
+	return xbitmap_walk_bits(bitmap, xrep_reap_block, &rs);
 }
 
 /*
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 840f74ec431c..dce791c679ee 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2018 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #ifndef __XFS_SCRUB_REPAIR_H__
 #define __XFS_SCRUB_REPAIR_H__
@@ -31,6 +31,7 @@ int xrep_init_btblock(struct xfs_scrub *sc, xfs_fsblock_t fsb,
 		const struct xfs_buf_ops *ops);
 
 struct xbitmap;
+struct xagb_bitmap;
 
 int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink);
 int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xbitmap *btlist);
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index 229826b2e1c0..d29a26ecddd6 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -1,21 +1,29 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2017 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
 #include "xfs_format.h"
+#include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
+#include "xfs_trans.h"
 #include "xfs_btree.h"
 #include "xfs_rmap.h"
 #include "xfs_refcount.h"
+#include "xfs_ag.h"
+#include "xfs_bit.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_refcount_btree.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/btree.h"
-#include "xfs_ag.h"
+#include "scrub/bitmap.h"
 
 /*
  * Set us up to scrub reverse mapping btrees.
@@ -24,11 +32,39 @@ int
 xchk_setup_ag_rmapbt(
 	struct xfs_scrub	*sc)
 {
+	if (xchk_need_intent_drain(sc))
+		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
 	return xchk_setup_ag_btree(sc, false);
 }
 
 /* Reverse-mapping scrubber. */
 
+struct xchk_rmap {
+	/*
+	 * The furthest-reaching of the rmapbt records that we've already
+	 * processed.  This enables us to detect overlapping records for space
+	 * allocations that cannot be shared.
+	 */
+	struct xfs_rmap_irec	overlap_rec;
+
+	/*
+	 * The previous rmapbt record, so that we can check for two records
+	 * that could be one.
+	 */
+	struct xfs_rmap_irec	prev_rec;
+
+	/* Bitmaps containing all blocks for each type of AG metadata. */
+	struct xagb_bitmap	fs_owned;
+	struct xagb_bitmap	log_owned;
+	struct xagb_bitmap	ag_owned;
+	struct xagb_bitmap	inobt_owned;
+	struct xagb_bitmap	refcbt_owned;
+
+	/* Did we complete the AG space metadata bitmaps? */
+	bool			bitmaps_complete;
+};
+
 /* Cross-reference a rmap against the refcount btree. */
 STATIC void
 xchk_rmapbt_xref_refc(
@@ -84,80 +120,415 @@ xchk_rmapbt_xref(
 		xchk_rmapbt_xref_refc(sc, irec);
 }
 
-/* Scrub an rmapbt record. */
-STATIC int
-xchk_rmapbt_rec(
-	struct xchk_btree	*bs,
-	const union xfs_btree_rec *rec)
+/*
+ * Check for bogus UNWRITTEN flags in the rmapbt node block keys.
+ *
+ * In reverse mapping records, the file mapping extent state
+ * (XFS_RMAP_OFF_UNWRITTEN) is a record attribute, not a key field.  It is not
+ * involved in lookups in any way.  In older kernels, the functions that
+ * convert rmapbt records to keys forgot to filter out the extent state bit,
+ * even though the key comparison functions have filtered the flag correctly.
+ * If we spot an rmap key with the unwritten bit set in rm_offset, we should
+ * mark the btree as needing optimization to rebuild the btree without those
+ * flags.
+ */
+STATIC void
+xchk_rmapbt_check_unwritten_in_keyflags(
+	struct xchk_btree	*bs)
 {
-	struct xfs_mount	*mp = bs->cur->bc_mp;
-	struct xfs_rmap_irec	irec;
-	struct xfs_perag	*pag = bs->cur->bc_ag.pag;
-	bool			non_inode;
-	bool			is_unwritten;
-	bool			is_bmbt;
-	bool			is_attr;
-	int			error;
+	struct xfs_scrub	*sc = bs->sc;
+	struct xfs_btree_cur	*cur = bs->cur;
+	struct xfs_btree_block	*keyblock;
+	union xfs_btree_key	*lkey, *hkey;
+	__be64			badflag = cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
+	unsigned int		level;
 
-	error = xfs_rmap_btrec_to_irec(rec, &irec);
-	if (!xchk_btree_process_error(bs->sc, bs->cur, 0, &error))
-		goto out;
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_PREEN)
+		return;
+
+	for (level = 1; level < cur->bc_nlevels; level++) {
+		struct xfs_buf	*bp;
+		unsigned int	ptr;
+
+		/* Only check the first time we've seen this node block. */
+		if (cur->bc_levels[level].ptr > 1)
+			continue;
+
+		keyblock = xfs_btree_get_block(cur, level, &bp);
+		for (ptr = 1; ptr <= be16_to_cpu(keyblock->bb_numrecs); ptr++) {
+			lkey = xfs_btree_key_addr(cur, ptr, keyblock);
+
+			if (lkey->rmap.rm_offset & badflag) {
+				xchk_btree_set_preen(sc, cur, level);
+				break;
+			}
+
+			hkey = xfs_btree_high_key_addr(cur, ptr, keyblock);
+			if (hkey->rmap.rm_offset & badflag) {
+				xchk_btree_set_preen(sc, cur, level);
+				break;
+			}
+		}
+	}
+}
+
+static inline bool
+xchk_rmapbt_is_shareable(
+	struct xfs_scrub		*sc,
+	const struct xfs_rmap_irec	*irec)
+{
+	if (!xfs_has_reflink(sc->mp))
+		return false;
+	if (XFS_RMAP_NON_INODE_OWNER(irec->rm_owner))
+		return false;
+	if (irec->rm_flags & (XFS_RMAP_BMBT_BLOCK | XFS_RMAP_ATTR_FORK |
+			      XFS_RMAP_UNWRITTEN))
+		return false;
+	return true;
+}
+
+/* Flag failures for records that overlap but cannot. */
+STATIC void
+xchk_rmapbt_check_overlapping(
+	struct xchk_btree		*bs,
+	struct xchk_rmap		*cr,
+	const struct xfs_rmap_irec	*irec)
+{
+	xfs_agblock_t			pnext, inext;
+
+	if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return;
+
+	/* No previous record? */
+	if (cr->overlap_rec.rm_blockcount == 0)
+		goto set_prev;
+
+	/* Do overlap_rec and irec overlap? */
+	pnext = cr->overlap_rec.rm_startblock + cr->overlap_rec.rm_blockcount;
+	if (pnext <= irec->rm_startblock)
+		goto set_prev;
 
-	/* Check extent. */
-	if (irec.rm_startblock + irec.rm_blockcount <= irec.rm_startblock)
+	/* Overlap is only allowed if both records are data fork mappings. */
+	if (!xchk_rmapbt_is_shareable(bs->sc, &cr->overlap_rec) ||
+	    !xchk_rmapbt_is_shareable(bs->sc, irec))
 		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
 
-	if (irec.rm_owner == XFS_RMAP_OWN_FS) {
+	/* Save whichever rmap record extends furthest. */
+	inext = irec->rm_startblock + irec->rm_blockcount;
+	if (pnext > inext)
+		return;
+
+set_prev:
+	memcpy(&cr->overlap_rec, irec, sizeof(struct xfs_rmap_irec));
+}
+
+/* Decide if two reverse-mapping records can be merged. */
+static inline bool
+xchk_rmap_mergeable(
+	struct xchk_rmap		*cr,
+	const struct xfs_rmap_irec	*r2)
+{
+	const struct xfs_rmap_irec	*r1 = &cr->prev_rec;
+
+	/* Ignore if prev_rec is not yet initialized. */
+	if (cr->prev_rec.rm_blockcount == 0)
+		return false;
+
+	if (r1->rm_owner != r2->rm_owner)
+		return false;
+	if (r1->rm_startblock + r1->rm_blockcount != r2->rm_startblock)
+		return false;
+	if ((unsigned long long)r1->rm_blockcount + r2->rm_blockcount >
+	    XFS_RMAP_LEN_MAX)
+		return false;
+	if (XFS_RMAP_NON_INODE_OWNER(r2->rm_owner))
+		return true;
+	/* must be an inode owner below here */
+	if (r1->rm_flags != r2->rm_flags)
+		return false;
+	if (r1->rm_flags & XFS_RMAP_BMBT_BLOCK)
+		return true;
+	return r1->rm_offset + r1->rm_blockcount == r2->rm_offset;
+}
+
+/* Flag failures for records that could be merged. */
+STATIC void
+xchk_rmapbt_check_mergeable(
+	struct xchk_btree		*bs,
+	struct xchk_rmap		*cr,
+	const struct xfs_rmap_irec	*irec)
+{
+	if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return;
+
+	if (xchk_rmap_mergeable(cr, irec))
+		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+	memcpy(&cr->prev_rec, irec, sizeof(struct xfs_rmap_irec));
+}
+
+/* Compare an rmap for AG metadata against the metadata walk. */
+STATIC int
+xchk_rmapbt_mark_bitmap(
+	struct xchk_btree		*bs,
+	struct xchk_rmap		*cr,
+	const struct xfs_rmap_irec	*irec)
+{
+	struct xfs_scrub		*sc = bs->sc;
+	struct xagb_bitmap		*bmp = NULL;
+	xfs_extlen_t			fsbcount = irec->rm_blockcount;
+
+	/*
+	 * Skip corrupt records.  It is essential that we detect records in the
+	 * btree that cannot overlap but do, flag those as CORRUPT, and skip
+	 * the bitmap comparison to avoid generating false XCORRUPT reports.
+	 */
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return 0;
+
+	/*
+	 * If the AG metadata walk didn't complete, there's no point in
+	 * comparing against partial results.
+	 */
+	if (!cr->bitmaps_complete)
+		return 0;
+
+	switch (irec->rm_owner) {
+	case XFS_RMAP_OWN_FS:
+		bmp = &cr->fs_owned;
+		break;
+	case XFS_RMAP_OWN_LOG:
+		bmp = &cr->log_owned;
+		break;
+	case XFS_RMAP_OWN_AG:
+		bmp = &cr->ag_owned;
+		break;
+	case XFS_RMAP_OWN_INOBT:
+		bmp = &cr->inobt_owned;
+		break;
+	case XFS_RMAP_OWN_REFC:
+		bmp = &cr->refcbt_owned;
+		break;
+	}
+
+	if (!bmp)
+		return 0;
+
+	if (xagb_bitmap_test(bmp, irec->rm_startblock, &fsbcount)) {
 		/*
-		 * xfs_verify_agbno returns false for static fs metadata.
-		 * Since that only exists at the start of the AG, validate
-		 * that by hand.
+		 * The start of this reverse mapping corresponds to a set
+		 * region in the bitmap.  If the mapping covers more area than
+		 * the set region, then it covers space that wasn't found by
+		 * the AG metadata walk.
 		 */
-		if (irec.rm_startblock != 0 ||
-		    irec.rm_blockcount != XFS_AGFL_BLOCK(mp) + 1)
-			xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+		if (fsbcount < irec->rm_blockcount)
+			xchk_btree_xref_set_corrupt(bs->sc,
+					bs->sc->sa.rmap_cur, 0);
 	} else {
 		/*
-		 * Otherwise we must point somewhere past the static metadata
-		 * but before the end of the FS.  Run the regular check.
+		 * The start of this reverse mapping does not correspond to a
+		 * completely set region in the bitmap.  The region wasn't
+		 * fully set by walking the AG metadata, so this is a
+		 * cross-referencing corruption.
 		 */
-		if (!xfs_verify_agbno(pag, irec.rm_startblock) ||
-		    !xfs_verify_agbno(pag, irec.rm_startblock +
-				irec.rm_blockcount - 1))
-			xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+		xchk_btree_xref_set_corrupt(bs->sc, bs->sc->sa.rmap_cur, 0);
 	}
 
-	/* Check flags. */
-	non_inode = XFS_RMAP_NON_INODE_OWNER(irec.rm_owner);
-	is_bmbt = irec.rm_flags & XFS_RMAP_BMBT_BLOCK;
-	is_attr = irec.rm_flags & XFS_RMAP_ATTR_FORK;
-	is_unwritten = irec.rm_flags & XFS_RMAP_UNWRITTEN;
+	/* Unset the region so that we can detect missing rmap records. */
+	return xagb_bitmap_clear(bmp, irec->rm_startblock, irec->rm_blockcount);
+}
 
-	if (is_bmbt && irec.rm_offset != 0)
-		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+/* Scrub an rmapbt record. */
+STATIC int
+xchk_rmapbt_rec(
+	struct xchk_btree	*bs,
+	const union xfs_btree_rec *rec)
+{
+	struct xchk_rmap	*cr = bs->private;
+	struct xfs_rmap_irec	irec;
 
-	if (non_inode && irec.rm_offset != 0)
+	if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL ||
+	    xfs_rmap_check_irec(bs->cur, &irec) != NULL) {
 		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+		return 0;
+	}
 
-	if (is_unwritten && (is_bmbt || non_inode || is_attr))
-		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+	xchk_rmapbt_check_unwritten_in_keyflags(bs);
+	xchk_rmapbt_check_mergeable(bs, cr, &irec);
+	xchk_rmapbt_check_overlapping(bs, cr, &irec);
+	xchk_rmapbt_xref(bs->sc, &irec);
 
-	if (non_inode && (is_bmbt || is_unwritten || is_attr))
-		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+	return xchk_rmapbt_mark_bitmap(bs, cr, &irec);
+}
 
-	if (!non_inode) {
-		if (!xfs_verify_ino(mp, irec.rm_owner))
-			xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
-	} else {
-		/* Non-inode owner within the magic values? */
-		if (irec.rm_owner <= XFS_RMAP_OWN_MIN ||
-		    irec.rm_owner > XFS_RMAP_OWN_FS)
-			xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+/* Add an AGFL block to the rmap list. */
+STATIC int
+xchk_rmapbt_walk_agfl(
+	struct xfs_mount	*mp,
+	xfs_agblock_t		agbno,
+	void			*priv)
+{
+	struct xagb_bitmap	*bitmap = priv;
+
+	return xagb_bitmap_set(bitmap, agbno, 1);
+}
+
+/*
+ * Set up bitmaps mapping all the AG metadata to compare with the rmapbt
+ * records.
+ *
+ * Grab our own btree cursors here if the scrub setup function didn't give us a
+ * btree cursor due to reports of poor health.  We need to find out if the
+ * rmapbt disagrees with primary metadata btrees to tag the rmapbt as being
+ * XCORRUPT.
+ */
+STATIC int
+xchk_rmapbt_walk_ag_metadata(
+	struct xfs_scrub	*sc,
+	struct xchk_rmap	*cr)
+{
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_buf		*agfl_bp;
+	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
+	struct xfs_btree_cur	*cur;
+	int			error;
+
+	/* OWN_FS: AG headers */
+	error = xagb_bitmap_set(&cr->fs_owned, XFS_SB_BLOCK(mp),
+			XFS_AGFL_BLOCK(mp) - XFS_SB_BLOCK(mp) + 1);
+	if (error)
+		goto out;
+
+	/* OWN_LOG: Internal log */
+	if (xfs_ag_contains_log(mp, sc->sa.pag->pag_agno)) {
+		error = xagb_bitmap_set(&cr->log_owned,
+				XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart),
+				mp->m_sb.sb_logblocks);
+		if (error)
+			goto out;
+	}
+
+	/* OWN_AG: bnobt, cntbt, rmapbt, and AGFL */
+	cur = sc->sa.bno_cur;
+	if (!cur)
+		cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+				sc->sa.pag, XFS_BTNUM_BNO);
+	error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur);
+	if (cur != sc->sa.bno_cur)
+		xfs_btree_del_cursor(cur, error);
+	if (error)
+		goto out;
+
+	cur = sc->sa.cnt_cur;
+	if (!cur)
+		cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+				sc->sa.pag, XFS_BTNUM_CNT);
+	error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur);
+	if (cur != sc->sa.cnt_cur)
+		xfs_btree_del_cursor(cur, error);
+	if (error)
+		goto out;
+
+	error = xagb_bitmap_set_btblocks(&cr->ag_owned, sc->sa.rmap_cur);
+	if (error)
+		goto out;
+
+	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
+	if (error)
+		goto out;
+
+	error = xfs_agfl_walk(sc->mp, agf, agfl_bp, xchk_rmapbt_walk_agfl,
+			&cr->ag_owned);
+	xfs_trans_brelse(sc->tp, agfl_bp);
+	if (error)
+		goto out;
+
+	/* OWN_INOBT: inobt, finobt */
+	cur = sc->sa.ino_cur;
+	if (!cur)
+		cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, sc->sa.agi_bp,
+				XFS_BTNUM_INO);
+	error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur);
+	if (cur != sc->sa.ino_cur)
+		xfs_btree_del_cursor(cur, error);
+	if (error)
+		goto out;
+
+	if (xfs_has_finobt(sc->mp)) {
+		cur = sc->sa.fino_cur;
+		if (!cur)
+			cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp,
+					sc->sa.agi_bp, XFS_BTNUM_FINO);
+		error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur);
+		if (cur != sc->sa.fino_cur)
+			xfs_btree_del_cursor(cur, error);
+		if (error)
+			goto out;
+	}
+
+	/* OWN_REFC: refcountbt */
+	if (xfs_has_reflink(sc->mp)) {
+		cur = sc->sa.refc_cur;
+		if (!cur)
+			cur = xfs_refcountbt_init_cursor(sc->mp, sc->tp,
+					sc->sa.agf_bp, sc->sa.pag);
+		error = xagb_bitmap_set_btblocks(&cr->refcbt_owned, cur);
+		if (cur != sc->sa.refc_cur)
+			xfs_btree_del_cursor(cur, error);
+		if (error)
+			goto out;
 	}
 
-	xchk_rmapbt_xref(bs->sc, &irec);
 out:
-	return error;
+	/*
+	 * If there's an error, set XFAIL and disable the bitmap
+	 * cross-referencing checks, but proceed with the scrub anyway.
+	 */
+	if (error)
+		xchk_btree_xref_process_error(sc, sc->sa.rmap_cur,
+				sc->sa.rmap_cur->bc_nlevels - 1, &error);
+	else
+		cr->bitmaps_complete = true;
+	return 0;
+}
+
+/*
+ * Check for set regions in the bitmaps; if there are any, the rmap records do
+ * not describe all the AG metadata.
+ */
+STATIC void
+xchk_rmapbt_check_bitmaps(
+	struct xfs_scrub	*sc,
+	struct xchk_rmap	*cr)
+{
+	struct xfs_btree_cur	*cur = sc->sa.rmap_cur;
+	unsigned int		level;
+
+	if (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+				XFS_SCRUB_OFLAG_XFAIL))
+		return;
+	if (!cur)
+		return;
+	level = cur->bc_nlevels - 1;
+
+	/*
+	 * Any bitmap with bits still set indicates that the reverse mapping
+	 * doesn't cover the entire primary structure.
+	 */
+	if (xagb_bitmap_hweight(&cr->fs_owned) != 0)
+		xchk_btree_xref_set_corrupt(sc, cur, level);
+
+	if (xagb_bitmap_hweight(&cr->log_owned) != 0)
+		xchk_btree_xref_set_corrupt(sc, cur, level);
+
+	if (xagb_bitmap_hweight(&cr->ag_owned) != 0)
+		xchk_btree_xref_set_corrupt(sc, cur, level);
+
+	if (xagb_bitmap_hweight(&cr->inobt_owned) != 0)
+		xchk_btree_xref_set_corrupt(sc, cur, level);
+
+	if (xagb_bitmap_hweight(&cr->refcbt_owned) != 0)
+		xchk_btree_xref_set_corrupt(sc, cur, level);
 }
 
 /* Scrub the rmap btree for some AG. */
@@ -165,42 +536,63 @@ int
 xchk_rmapbt(
 	struct xfs_scrub	*sc)
 {
-	return xchk_btree(sc, sc->sa.rmap_cur, xchk_rmapbt_rec,
-			&XFS_RMAP_OINFO_AG, NULL);
+	struct xchk_rmap	*cr;
+	int			error;
+
+	cr = kzalloc(sizeof(struct xchk_rmap), XCHK_GFP_FLAGS);
+	if (!cr)
+		return -ENOMEM;
+
+	xagb_bitmap_init(&cr->fs_owned);
+	xagb_bitmap_init(&cr->log_owned);
+	xagb_bitmap_init(&cr->ag_owned);
+	xagb_bitmap_init(&cr->inobt_owned);
+	xagb_bitmap_init(&cr->refcbt_owned);
+
+	error = xchk_rmapbt_walk_ag_metadata(sc, cr);
+	if (error)
+		goto out;
+
+	error = xchk_btree(sc, sc->sa.rmap_cur, xchk_rmapbt_rec,
+			&XFS_RMAP_OINFO_AG, cr);
+	if (error)
+		goto out;
+
+	xchk_rmapbt_check_bitmaps(sc, cr);
+
+out:
+	xagb_bitmap_destroy(&cr->refcbt_owned);
+	xagb_bitmap_destroy(&cr->inobt_owned);
+	xagb_bitmap_destroy(&cr->ag_owned);
+	xagb_bitmap_destroy(&cr->log_owned);
+	xagb_bitmap_destroy(&cr->fs_owned);
+	kfree(cr);
+	return error;
 }
 
-/* xref check that the extent is owned by a given owner */
-static inline void
-xchk_xref_check_owner(
+/* xref check that the extent is owned only by a given owner */
+void
+xchk_xref_is_only_owned_by(
 	struct xfs_scrub		*sc,
 	xfs_agblock_t			bno,
 	xfs_extlen_t			len,
-	const struct xfs_owner_info	*oinfo,
-	bool				should_have_rmap)
+	const struct xfs_owner_info	*oinfo)
 {
-	bool				has_rmap;
+	struct xfs_rmap_matches		res;
 	int				error;
 
 	if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
 		return;
 
-	error = xfs_rmap_record_exists(sc->sa.rmap_cur, bno, len, oinfo,
-			&has_rmap);
+	error = xfs_rmap_count_owners(sc->sa.rmap_cur, bno, len, oinfo, &res);
 	if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
 		return;
-	if (has_rmap != should_have_rmap)
+	if (res.matches != 1)
+		xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+	if (res.bad_non_owner_matches)
+		xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+	if (res.non_owner_matches)
 		xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
-}
-
-/* xref check that the extent is owned by a given owner */
-void
-xchk_xref_is_owned_by(
-	struct xfs_scrub		*sc,
-	xfs_agblock_t			bno,
-	xfs_extlen_t			len,
-	const struct xfs_owner_info	*oinfo)
-{
-	xchk_xref_check_owner(sc, bno, len, oinfo, true);
 }
 
 /* xref check that the extent is not owned by a given owner */
@@ -211,7 +603,19 @@ xchk_xref_is_not_owned_by(
 	xfs_extlen_t			len,
 	const struct xfs_owner_info	*oinfo)
 {
-	xchk_xref_check_owner(sc, bno, len, oinfo, false);
+	struct xfs_rmap_matches		res;
+	int				error;
+
+	if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
+		return;
+
+	error = xfs_rmap_count_owners(sc->sa.rmap_cur, bno, len, oinfo, &res);
+	if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
+		return;
+	if (res.matches != 0)
+		xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+	if (res.bad_non_owner_matches)
+		xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
 }
 
 /* xref check that the extent has no reverse mapping at all */
@@ -221,15 +625,15 @@ xchk_xref_has_no_owner(
 	xfs_agblock_t		bno,
 	xfs_extlen_t		len)
 {
-	bool			has_rmap;
+	enum xbtree_recpacking	outcome;
 	int			error;
 
 	if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
 		return;
 
-	error = xfs_rmap_has_record(sc->sa.rmap_cur, bno, len, &has_rmap);
+	error = xfs_rmap_has_records(sc->sa.rmap_cur, bno, len, &outcome);
 	if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
 		return;
-	if (has_rmap)
+	if (outcome != XBTREE_RECPACKING_EMPTY)
 		xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
 }
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 0a3bde64c675..e7dace7b4be8 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2017 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_fs.h"
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 07a7a75f987f..02819bedc5b1 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2017 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_fs.h"
@@ -145,6 +145,21 @@ xchk_probe(
 
 /* Scrub setup and teardown */
 
+static inline void
+xchk_fsgates_disable(
+	struct xfs_scrub	*sc)
+{
+	if (!(sc->flags & XCHK_FSGATES_ALL))
+		return;
+
+	trace_xchk_fsgates_disable(sc, sc->flags & XCHK_FSGATES_ALL);
+
+	if (sc->flags & XCHK_FSGATES_DRAIN)
+		xfs_drain_wait_disable();
+
+	sc->flags &= ~XCHK_FSGATES_ALL;
+}
+
 /* Free all the resources and finish the transactions. */
 STATIC int
 xchk_teardown(
@@ -166,7 +181,7 @@ xchk_teardown(
 			xfs_iunlock(sc->ip, sc->ilock_flags);
 		if (sc->ip != ip_in &&
 		    !xfs_internal_inum(sc->mp, sc->ip->i_ino))
-			xfs_irele(sc->ip);
+			xchk_irele(sc, sc->ip);
 		sc->ip = NULL;
 	}
 	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
@@ -174,9 +189,14 @@ xchk_teardown(
 	if (sc->flags & XCHK_REAPING_DISABLED)
 		xchk_start_reaping(sc);
 	if (sc->buf) {
+		if (sc->buf_cleanup)
+			sc->buf_cleanup(sc->buf);
 		kvfree(sc->buf);
+		sc->buf_cleanup = NULL;
 		sc->buf = NULL;
 	}
+
+	xchk_fsgates_disable(sc);
 	return error;
 }
 
@@ -191,25 +211,25 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
 	},
 	[XFS_SCRUB_TYPE_SB] = {		/* superblock */
 		.type	= ST_PERAG,
-		.setup	= xchk_setup_fs,
+		.setup	= xchk_setup_agheader,
 		.scrub	= xchk_superblock,
 		.repair	= xrep_superblock,
 	},
 	[XFS_SCRUB_TYPE_AGF] = {	/* agf */
 		.type	= ST_PERAG,
-		.setup	= xchk_setup_fs,
+		.setup	= xchk_setup_agheader,
 		.scrub	= xchk_agf,
 		.repair	= xrep_agf,
 	},
 	[XFS_SCRUB_TYPE_AGFL]= {	/* agfl */
 		.type	= ST_PERAG,
-		.setup	= xchk_setup_fs,
+		.setup	= xchk_setup_agheader,
 		.scrub	= xchk_agfl,
 		.repair	= xrep_agfl,
 	},
 	[XFS_SCRUB_TYPE_AGI] = {	/* agi */
 		.type	= ST_PERAG,
-		.setup	= xchk_setup_fs,
+		.setup	= xchk_setup_agheader,
 		.scrub	= xchk_agi,
 		.repair	= xrep_agi,
 	},
@@ -491,23 +511,20 @@ retry_op:
 
 	/* Set up for the operation. */
 	error = sc->ops->setup(sc);
+	if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER))
+		goto try_harder;
+	if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN))
+		goto need_drain;
 	if (error)
 		goto out_teardown;
 
 	/* Scrub for errors. */
 	error = sc->ops->scrub(sc);
-	if (!(sc->flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) {
-		/*
-		 * Scrubbers return -EDEADLOCK to mean 'try harder'.
-		 * Tear down everything we hold, then set up again with
-		 * preparation for worst-case scenarios.
-		 */
-		error = xchk_teardown(sc, 0);
-		if (error)
-			goto out_sc;
-		sc->flags |= XCHK_TRY_HARDER;
-		goto retry_op;
-	} else if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE))
+	if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER))
+		goto try_harder;
+	if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN))
+		goto need_drain;
+	if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE))
 		goto out_teardown;
 
 	xchk_update_health(sc);
@@ -565,4 +582,21 @@ out:
 		error = 0;
 	}
 	return error;
+need_drain:
+	error = xchk_teardown(sc, 0);
+	if (error)
+		goto out_sc;
+	sc->flags |= XCHK_NEED_DRAIN;
+	goto retry_op;
+try_harder:
+	/*
+	 * Scrubbers return -EDEADLOCK to mean 'try harder'.  Tear down
+	 * everything we hold, then set up again with preparation for
+	 * worst-case scenarios.
+	 */
+	error = xchk_teardown(sc, 0);
+	if (error)
+		goto out_sc;
+	sc->flags |= XCHK_TRY_HARDER;
+	goto retry_op;
 }
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index b4d391b4c938..e71903474cd7 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2017 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #ifndef __XFS_SCRUB_SCRUB_H__
 #define __XFS_SCRUB_SCRUB_H__
@@ -77,7 +77,17 @@ struct xfs_scrub {
 	 */
 	struct xfs_inode		*ip;
 
+	/* Kernel memory buffer used by scrubbers; freed at teardown. */
 	void				*buf;
+
+	/*
+	 * Clean up resources owned by whatever is in the buffer.  Cleanup can
+	 * be deferred with this hook as a means for scrub functions to pass
+	 * data to repair functions.  This function must not free the buffer
+	 * itself.
+	 */
+	void				(*buf_cleanup)(void *buf);
+
 	uint				ilock_flags;
 
 	/* See the XCHK/XREP state flags below. */
@@ -96,9 +106,19 @@ struct xfs_scrub {
 
 /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */
 #define XCHK_TRY_HARDER		(1 << 0)  /* can't get resources, try again */
-#define XCHK_REAPING_DISABLED	(1 << 2)  /* background block reaping paused */
+#define XCHK_REAPING_DISABLED	(1 << 1)  /* background block reaping paused */
+#define XCHK_FSGATES_DRAIN	(1 << 2)  /* defer ops draining enabled */
+#define XCHK_NEED_DRAIN		(1 << 3)  /* scrub needs to drain defer ops */
 #define XREP_ALREADY_FIXED	(1 << 31) /* checking our repair work */
 
+/*
+ * The XCHK_FSGATES* flags reflect functionality in the main filesystem that
+ * are only enabled for this particular online fsck.  When not in use, the
+ * features are gated off via dynamic code patching, which is why the state
+ * must be enabled during scrub setup and can only be torn down afterwards.
+ */
+#define XCHK_FSGATES_ALL	(XCHK_FSGATES_DRAIN)
+
 /* Metadata scrubbers */
 int xchk_tester(struct xfs_scrub *sc);
 int xchk_superblock(struct xfs_scrub *sc);
@@ -152,7 +172,7 @@ void xchk_xref_is_not_inode_chunk(struct xfs_scrub *sc, xfs_agblock_t agbno,
 		xfs_extlen_t len);
 void xchk_xref_is_inode_chunk(struct xfs_scrub *sc, xfs_agblock_t agbno,
 		xfs_extlen_t len);
-void xchk_xref_is_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno,
+void xchk_xref_is_only_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno,
 		xfs_extlen_t len, const struct xfs_owner_info *oinfo);
 void xchk_xref_is_not_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno,
 		xfs_extlen_t len, const struct xfs_owner_info *oinfo);
@@ -162,6 +182,8 @@ void xchk_xref_is_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno,
 		xfs_extlen_t len);
 void xchk_xref_is_not_shared(struct xfs_scrub *sc, xfs_agblock_t bno,
 		xfs_extlen_t len);
+void xchk_xref_is_not_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno,
+		xfs_extlen_t len);
 #ifdef CONFIG_XFS_RT
 void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno,
 		xfs_extlen_t len);
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index c1c99ffe7408..38708fb9a5d7 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2017 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_fs.h"
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index b5f94676c37c..0a975439d2b6 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2017 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_fs.h"
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 93ece6df02e3..68efd6fda61c 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2017 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  *
  * NOTE: none of these tracepoints shall be considered a stable kernel ABI
  * as they can change at any time.  See xfs_trace.h for documentation of
@@ -30,6 +30,9 @@ TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi);
 TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi);
 TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi);
 
+TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED);
+TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW);
+
 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PROBE);
 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SB);
 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGF);
@@ -93,6 +96,13 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
 	{ XFS_SCRUB_OFLAG_WARNING,		"warning" }, \
 	{ XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED,	"norepair" }
 
+#define XFS_SCRUB_STATE_STRINGS \
+	{ XCHK_TRY_HARDER,			"try_harder" }, \
+	{ XCHK_REAPING_DISABLED,		"reaping_disabled" }, \
+	{ XCHK_FSGATES_DRAIN,			"fsgates_drain" }, \
+	{ XCHK_NEED_DRAIN,			"need_drain" }, \
+	{ XREP_ALREADY_FIXED,			"already_fixed" }
+
 DECLARE_EVENT_CLASS(xchk_class,
 	TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm,
 		 int error),
@@ -139,6 +149,33 @@ DEFINE_SCRUB_EVENT(xchk_deadlock_retry);
 DEFINE_SCRUB_EVENT(xrep_attempt);
 DEFINE_SCRUB_EVENT(xrep_done);
 
+DECLARE_EVENT_CLASS(xchk_fsgate_class,
+	TP_PROTO(struct xfs_scrub *sc, unsigned int fsgate_flags),
+	TP_ARGS(sc, fsgate_flags),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned int, type)
+		__field(unsigned int, fsgate_flags)
+	),
+	TP_fast_assign(
+		__entry->dev = sc->mp->m_super->s_dev;
+		__entry->type = sc->sm->sm_type;
+		__entry->fsgate_flags = fsgate_flags;
+	),
+	TP_printk("dev %d:%d type %s fsgates '%s'",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+		  __print_flags(__entry->fsgate_flags, "|", XFS_SCRUB_STATE_STRINGS))
+)
+
+#define DEFINE_SCRUB_FSHOOK_EVENT(name) \
+DEFINE_EVENT(xchk_fsgate_class, name, \
+	TP_PROTO(struct xfs_scrub *sc, unsigned int fsgates_flags), \
+	TP_ARGS(sc, fsgates_flags))
+
+DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_enable);
+DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_disable);
+
 TRACE_EVENT(xchk_op_error,
 	TP_PROTO(struct xfs_scrub *sc, xfs_agnumber_t agno,
 		 xfs_agblock_t bno, int error, void *ret_ip),
@@ -657,6 +694,38 @@ TRACE_EVENT(xchk_fscounters_within_range,
 		  __entry->old_value)
 )
 
+TRACE_EVENT(xchk_refcount_incorrect,
+	TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *irec,
+		 xfs_nlink_t seen),
+	TP_ARGS(pag, irec, seen),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(enum xfs_refc_domain, domain)
+		__field(xfs_agblock_t, startblock)
+		__field(xfs_extlen_t, blockcount)
+		__field(xfs_nlink_t, refcount)
+		__field(xfs_nlink_t, seen)
+	),
+	TP_fast_assign(
+		__entry->dev = pag->pag_mount->m_super->s_dev;
+		__entry->agno = pag->pag_agno;
+		__entry->domain = irec->rc_domain;
+		__entry->startblock = irec->rc_startblock;
+		__entry->blockcount = irec->rc_blockcount;
+		__entry->refcount = irec->rc_refcount;
+		__entry->seen = seen;
+	),
+	TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u seen %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS),
+		  __entry->startblock,
+		  __entry->blockcount,
+		  __entry->refcount,
+		  __entry->seen)
+)
+
 /* repair tracepoints */
 #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
 
diff --git a/fs/xfs/scrub/xfs_scrub.h b/fs/xfs/scrub/xfs_scrub.h
index 2ceae614ade8..a39befa743ce 100644
--- a/fs/xfs/scrub/xfs_scrub.h
+++ b/fs/xfs/scrub/xfs_scrub.h
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Copyright (C) 2017 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #ifndef __XFS_SCRUB_H__
 #define __XFS_SCRUB_H__
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 6e2f0013380a..7551c3ec4ea5 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -24,6 +24,7 @@
 #include "xfs_error.h"
 #include "xfs_log_priv.h"
 #include "xfs_log_recover.h"
+#include "xfs_ag.h"
 
 struct kmem_cache	*xfs_bui_cache;
 struct kmem_cache	*xfs_bud_cache;
@@ -363,6 +364,34 @@ xfs_bmap_update_create_done(
 	return &xfs_trans_get_bud(tp, BUI_ITEM(intent))->bud_item;
 }
 
+/* Take a passive ref to the AG containing the space we're mapping. */
+void
+xfs_bmap_update_get_group(
+	struct xfs_mount	*mp,
+	struct xfs_bmap_intent	*bi)
+{
+	xfs_agnumber_t		agno;
+
+	agno = XFS_FSB_TO_AGNO(mp, bi->bi_bmap.br_startblock);
+
+	/*
+	 * Bump the intent count on behalf of the deferred rmap and refcount
+	 * intent items that that we can queue when we finish this bmap work.
+	 * This new intent item will bump the intent count before the bmap
+	 * intent drops the intent count, ensuring that the intent count
+	 * remains nonzero across the transaction roll.
+	 */
+	bi->bi_pag = xfs_perag_intent_get(mp, agno);
+}
+
+/* Release a passive AG ref after finishing mapping work. */
+static inline void
+xfs_bmap_update_put_group(
+	struct xfs_bmap_intent	*bi)
+{
+	xfs_perag_intent_put(bi->bi_pag);
+}
+
 /* Process a deferred rmap update. */
 STATIC int
 xfs_bmap_update_finish_item(
@@ -381,6 +410,8 @@ xfs_bmap_update_finish_item(
 		ASSERT(bi->bi_type == XFS_BMAP_UNMAP);
 		return -EAGAIN;
 	}
+
+	xfs_bmap_update_put_group(bi);
 	kmem_cache_free(xfs_bmap_intent_cache, bi);
 	return error;
 }
@@ -393,7 +424,7 @@ xfs_bmap_update_abort_intent(
 	xfs_bui_release(BUI_ITEM(intent));
 }
 
-/* Cancel a deferred rmap update. */
+/* Cancel a deferred bmap update. */
 STATIC void
 xfs_bmap_update_cancel_item(
 	struct list_head		*item)
@@ -401,6 +432,8 @@ xfs_bmap_update_cancel_item(
 	struct xfs_bmap_intent		*bi;
 
 	bi = container_of(item, struct xfs_bmap_intent, bi_list);
+
+	xfs_bmap_update_put_group(bi);
 	kmem_cache_free(xfs_bmap_intent_cache, bi);
 }
 
@@ -509,10 +542,12 @@ xfs_bui_item_recover(
 	fake.bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
 			XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
 
+	xfs_bmap_update_get_group(mp, &fake);
 	error = xfs_trans_log_finish_bmap_update(tp, budp, &fake);
 	if (error == -EFSCORRUPTED)
 		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map,
 				sizeof(*map));
+	xfs_bmap_update_put_group(&fake);
 	if (error)
 		goto err_cancel;
 
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index a09dd2606479..f032d3a4b727 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -314,15 +314,13 @@ xfs_getbmap_report_one(
 	if (isnullstartblock(got->br_startblock) ||
 	    got->br_startblock == DELAYSTARTBLOCK) {
 		/*
-		 * Delalloc extents that start beyond EOF can occur due to
-		 * speculative EOF allocation when the delalloc extent is larger
-		 * than the largest freespace extent at conversion time.  These
-		 * extents cannot be converted by data writeback, so can exist
-		 * here even if we are not supposed to be finding delalloc
-		 * extents.
+		 * Take the flush completion as being a point-in-time snapshot
+		 * where there are no delalloc extents, and if any new ones
+		 * have been created racily, just skip them as being 'after'
+		 * the flush and so don't get reported.
 		 */
-		if (got->br_startoff < XFS_B_TO_FSB(ip->i_mount, XFS_ISIZE(ip)))
-			ASSERT((bmv->bmv_iflags & BMV_IF_DELALLOC) != 0);
+		if (!(bmv->bmv_iflags & BMV_IF_DELALLOC))
+			return 0;
 
 		p->bmv_oflags |= BMV_OF_DELALLOC;
 		p->bmv_block = -2;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 54c774af6e1c..15d1e5a7c2d3 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -286,8 +286,7 @@ xfs_buf_free_pages(
 		if (bp->b_pages[i])
 			__free_page(bp->b_pages[i]);
 	}
-	if (current->reclaim_state)
-		current->reclaim_state->reclaimed_slab += bp->b_page_count;
+	mm_account_reclaimed_pages(bp->b_page_count);
 
 	if (bp->b_pages != bp->b_page_array)
 		kmem_free(bp->b_pages);
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index ffa94102094d..43167f543afc 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -943,6 +943,16 @@ xlog_recover_buf_commit_pass2(
 	if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
 		trace_xfs_log_recover_buf_skip(log, buf_f);
 		xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
+
+		/*
+		 * We're skipping replay of this buffer log item due to the log
+		 * item LSN being behind the ondisk buffer.  Verify the buffer
+		 * contents since we aren't going to run the write verifier.
+		 */
+		if (bp->b_ops) {
+			bp->b_ops->verify_read(bp);
+			error = bp->b_error;
+		}
 		goto out_release;
 	}
 
diff --git a/fs/xfs/xfs_dahash_test.c b/fs/xfs/xfs_dahash_test.c
index 230651ab5ce4..0dab5941e080 100644
--- a/fs/xfs/xfs_dahash_test.c
+++ b/fs/xfs/xfs_dahash_test.c
@@ -9,6 +9,9 @@
 #include "xfs_format.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_dir2_priv.h"
 #include "xfs_dahash_test.h"
 
 /* 4096 random bytes */
@@ -533,108 +536,109 @@ static struct dahash_test {
 	uint16_t	start;	/* random 12 bit offset in buf */
 	uint16_t	length;	/* random 8 bit length of test */
 	xfs_dahash_t	dahash;	/* expected dahash result */
+	xfs_dahash_t	ascii_ci_dahash; /* expected ascii-ci dahash result */
 } test[] __initdata =
 {
-	{0x0567, 0x0097, 0x96951389},
-	{0x0869, 0x0055, 0x6455ab4f},
-	{0x0c51, 0x00be, 0x8663afde},
-	{0x044a, 0x00fc, 0x98fbe432},
-	{0x0f29, 0x0079, 0x42371997},
-	{0x08ba, 0x0052, 0x942be4f7},
-	{0x01f2, 0x0013, 0x5262687e},
-	{0x09e3, 0x00e2, 0x8ffb0908},
-	{0x007c, 0x0051, 0xb3158491},
-	{0x0854, 0x001f, 0x83bb20d9},
-	{0x031b, 0x0008, 0x98970bdf},
-	{0x0de7, 0x0027, 0xbfbf6f6c},
-	{0x0f76, 0x0005, 0x906a7105},
-	{0x092e, 0x00d0, 0x86631850},
-	{0x0233, 0x0082, 0xdbdd914e},
-	{0x04c9, 0x0075, 0x5a400a9e},
-	{0x0b66, 0x0099, 0xae128b45},
-	{0x000d, 0x00ed, 0xe61c216a},
-	{0x0a31, 0x003d, 0xf69663b9},
-	{0x00a3, 0x0052, 0x643c39ae},
-	{0x0125, 0x00d5, 0x7c310b0d},
-	{0x0105, 0x004a, 0x06a77e74},
-	{0x0858, 0x008e, 0x265bc739},
-	{0x045e, 0x0095, 0x13d6b192},
-	{0x0dab, 0x003c, 0xc4498704},
-	{0x00cd, 0x00b5, 0x802a4e2d},
-	{0x069b, 0x008c, 0x5df60f71},
-	{0x0454, 0x006c, 0x5f03d8bb},
-	{0x040e, 0x0032, 0x0ce513b5},
-	{0x0874, 0x00e2, 0x6a811fb3},
-	{0x0521, 0x00b4, 0x93296833},
-	{0x0ddc, 0x00cf, 0xf9305338},
-	{0x0a70, 0x0023, 0x239549ea},
-	{0x083e, 0x0027, 0x2d88ba97},
-	{0x0241, 0x00a7, 0xfe0b32e1},
-	{0x0dfc, 0x0096, 0x1a11e815},
-	{0x023e, 0x001e, 0xebc9a1f3},
-	{0x067e, 0x0066, 0xb1067f81},
-	{0x09ea, 0x000e, 0x46fd7247},
-	{0x036b, 0x008c, 0x1a39acdf},
-	{0x078f, 0x0030, 0x964042ab},
-	{0x085c, 0x008f, 0x1829edab},
-	{0x02ec, 0x009f, 0x6aefa72d},
-	{0x043b, 0x00ce, 0x65642ff5},
-	{0x0a32, 0x00b8, 0xbd82759e},
-	{0x0d3c, 0x0087, 0xf4d66d54},
-	{0x09ec, 0x008a, 0x06bfa1ff},
-	{0x0902, 0x0015, 0x755025d2},
-	{0x08fe, 0x000e, 0xf690ce2d},
-	{0x00fb, 0x00dc, 0xe55f1528},
-	{0x0eaa, 0x003a, 0x0fe0a8d7},
-	{0x05fb, 0x0006, 0x86281cfb},
-	{0x0dd1, 0x00a7, 0x60ab51b4},
-	{0x0005, 0x001b, 0xf51d969b},
-	{0x077c, 0x00dd, 0xc2fed268},
-	{0x0575, 0x00f5, 0x432c0b1a},
-	{0x05be, 0x0088, 0x78baa04b},
-	{0x0c89, 0x0068, 0xeda9e428},
-	{0x0f5c, 0x0068, 0xec143c76},
-	{0x06a8, 0x0009, 0xd72651ce},
-	{0x060f, 0x008e, 0x765426cd},
-	{0x07b1, 0x0047, 0x2cfcfa0c},
-	{0x04f1, 0x0041, 0x55b172f9},
-	{0x0e05, 0x00ac, 0x61efde93},
-	{0x0bf7, 0x0097, 0x05b83eee},
-	{0x04e9, 0x00f3, 0x9928223a},
-	{0x023a, 0x0005, 0xdfada9bc},
-	{0x0acb, 0x000e, 0x2217cecd},
-	{0x0148, 0x0060, 0xbc3f7405},
-	{0x0764, 0x0059, 0xcbc201b1},
-	{0x021f, 0x0059, 0x5d6b2256},
-	{0x0f1e, 0x006c, 0xdefeeb45},
-	{0x071c, 0x00b9, 0xb9b59309},
-	{0x0564, 0x0063, 0xae064271},
-	{0x0b14, 0x0044, 0xdb867d9b},
-	{0x0e5a, 0x0055, 0xff06b685},
-	{0x015e, 0x00ba, 0x1115ccbc},
-	{0x0379, 0x00e6, 0x5f4e58dd},
-	{0x013b, 0x0067, 0x4897427e},
-	{0x0e64, 0x0071, 0x7af2b7a4},
-	{0x0a11, 0x0050, 0x92105726},
-	{0x0109, 0x0055, 0xd0d000f9},
-	{0x00aa, 0x0022, 0x815d229d},
-	{0x09ac, 0x004f, 0x02f9d985},
-	{0x0e1b, 0x00ce, 0x5cf92ab4},
-	{0x08af, 0x00d8, 0x17ca72d1},
-	{0x0e33, 0x000a, 0xda2dba6b},
-	{0x0ee3, 0x006a, 0xb00048e5},
-	{0x0648, 0x001a, 0x2364b8cb},
-	{0x0315, 0x0085, 0x0596fd0d},
-	{0x0fbb, 0x003e, 0x298230ca},
-	{0x0422, 0x006a, 0x78ada4ab},
-	{0x04ba, 0x0073, 0xced1fbc2},
-	{0x007d, 0x0061, 0x4b7ff236},
-	{0x070b, 0x00d0, 0x261cf0ae},
-	{0x0c1a, 0x0035, 0x8be92ee2},
-	{0x0af8, 0x0063, 0x824dcf03},
-	{0x08f8, 0x006d, 0xd289710c},
-	{0x021b, 0x00ee, 0x6ac1c41d},
-	{0x05b5, 0x00da, 0x8e52f0e2},
+	{0x0567, 0x0097, 0x96951389, 0xc153aa0d},
+	{0x0869, 0x0055, 0x6455ab4f, 0xd07f69bf},
+	{0x0c51, 0x00be, 0x8663afde, 0xf9add90c},
+	{0x044a, 0x00fc, 0x98fbe432, 0xbf2abb76},
+	{0x0f29, 0x0079, 0x42371997, 0x282588b3},
+	{0x08ba, 0x0052, 0x942be4f7, 0x2e023547},
+	{0x01f2, 0x0013, 0x5262687e, 0x5266287e},
+	{0x09e3, 0x00e2, 0x8ffb0908, 0x1da892f3},
+	{0x007c, 0x0051, 0xb3158491, 0xb67f9e63},
+	{0x0854, 0x001f, 0x83bb20d9, 0x22bb21db},
+	{0x031b, 0x0008, 0x98970bdf, 0x9cd70adf},
+	{0x0de7, 0x0027, 0xbfbf6f6c, 0xae3f296c},
+	{0x0f76, 0x0005, 0x906a7105, 0x906a7105},
+	{0x092e, 0x00d0, 0x86631850, 0xa3f6ac04},
+	{0x0233, 0x0082, 0xdbdd914e, 0x5d8c7aac},
+	{0x04c9, 0x0075, 0x5a400a9e, 0x12f60711},
+	{0x0b66, 0x0099, 0xae128b45, 0x7551310d},
+	{0x000d, 0x00ed, 0xe61c216a, 0xc22d3c4c},
+	{0x0a31, 0x003d, 0xf69663b9, 0x51960bf8},
+	{0x00a3, 0x0052, 0x643c39ae, 0xa93c73a8},
+	{0x0125, 0x00d5, 0x7c310b0d, 0xf221cbb3},
+	{0x0105, 0x004a, 0x06a77e74, 0xa4ef4561},
+	{0x0858, 0x008e, 0x265bc739, 0xd6c36d9b},
+	{0x045e, 0x0095, 0x13d6b192, 0x5f5c1d62},
+	{0x0dab, 0x003c, 0xc4498704, 0x10414654},
+	{0x00cd, 0x00b5, 0x802a4e2d, 0xfbd17c9d},
+	{0x069b, 0x008c, 0x5df60f71, 0x91ddca5f},
+	{0x0454, 0x006c, 0x5f03d8bb, 0x5c59fce0},
+	{0x040e, 0x0032, 0x0ce513b5, 0xa8cd99b1},
+	{0x0874, 0x00e2, 0x6a811fb3, 0xca028316},
+	{0x0521, 0x00b4, 0x93296833, 0x2c4d4880},
+	{0x0ddc, 0x00cf, 0xf9305338, 0x2c94210d},
+	{0x0a70, 0x0023, 0x239549ea, 0x22b561aa},
+	{0x083e, 0x0027, 0x2d88ba97, 0x5cd8bb9d},
+	{0x0241, 0x00a7, 0xfe0b32e1, 0x17b506b8},
+	{0x0dfc, 0x0096, 0x1a11e815, 0xee4141bd},
+	{0x023e, 0x001e, 0xebc9a1f3, 0x5689a1f3},
+	{0x067e, 0x0066, 0xb1067f81, 0xd9952571},
+	{0x09ea, 0x000e, 0x46fd7247, 0x42b57245},
+	{0x036b, 0x008c, 0x1a39acdf, 0x58bf1586},
+	{0x078f, 0x0030, 0x964042ab, 0xb04218b9},
+	{0x085c, 0x008f, 0x1829edab, 0x9ceca89c},
+	{0x02ec, 0x009f, 0x6aefa72d, 0x634cc2a7},
+	{0x043b, 0x00ce, 0x65642ff5, 0x6c8a584e},
+	{0x0a32, 0x00b8, 0xbd82759e, 0x0f96a34f},
+	{0x0d3c, 0x0087, 0xf4d66d54, 0xb71ba5f4},
+	{0x09ec, 0x008a, 0x06bfa1ff, 0x576ca80f},
+	{0x0902, 0x0015, 0x755025d2, 0x517225c2},
+	{0x08fe, 0x000e, 0xf690ce2d, 0xf690cf3d},
+	{0x00fb, 0x00dc, 0xe55f1528, 0x707d7d92},
+	{0x0eaa, 0x003a, 0x0fe0a8d7, 0x87638cc5},
+	{0x05fb, 0x0006, 0x86281cfb, 0x86281cf9},
+	{0x0dd1, 0x00a7, 0x60ab51b4, 0xe28ef00c},
+	{0x0005, 0x001b, 0xf51d969b, 0xe71dd6d3},
+	{0x077c, 0x00dd, 0xc2fed268, 0xdc30c555},
+	{0x0575, 0x00f5, 0x432c0b1a, 0x81dd7d16},
+	{0x05be, 0x0088, 0x78baa04b, 0xd69b433e},
+	{0x0c89, 0x0068, 0xeda9e428, 0xe9b4fa0a},
+	{0x0f5c, 0x0068, 0xec143c76, 0x9947067a},
+	{0x06a8, 0x0009, 0xd72651ce, 0xd72651ee},
+	{0x060f, 0x008e, 0x765426cd, 0x2099626f},
+	{0x07b1, 0x0047, 0x2cfcfa0c, 0x1a4baa07},
+	{0x04f1, 0x0041, 0x55b172f9, 0x15331a79},
+	{0x0e05, 0x00ac, 0x61efde93, 0x320568cc},
+	{0x0bf7, 0x0097, 0x05b83eee, 0xc72fb7a3},
+	{0x04e9, 0x00f3, 0x9928223a, 0xe8c77de2},
+	{0x023a, 0x0005, 0xdfada9bc, 0xdfadb9be},
+	{0x0acb, 0x000e, 0x2217cecd, 0x0017d6cd},
+	{0x0148, 0x0060, 0xbc3f7405, 0xf5fd6615},
+	{0x0764, 0x0059, 0xcbc201b1, 0xbb089bf4},
+	{0x021f, 0x0059, 0x5d6b2256, 0xa16a0a59},
+	{0x0f1e, 0x006c, 0xdefeeb45, 0xfc34f9d6},
+	{0x071c, 0x00b9, 0xb9b59309, 0xb645eae2},
+	{0x0564, 0x0063, 0xae064271, 0x954dc6d1},
+	{0x0b14, 0x0044, 0xdb867d9b, 0xdf432309},
+	{0x0e5a, 0x0055, 0xff06b685, 0xa65ff257},
+	{0x015e, 0x00ba, 0x1115ccbc, 0x11c365f4},
+	{0x0379, 0x00e6, 0x5f4e58dd, 0x2d176d31},
+	{0x013b, 0x0067, 0x4897427e, 0xc40532fe},
+	{0x0e64, 0x0071, 0x7af2b7a4, 0x1fb7bf43},
+	{0x0a11, 0x0050, 0x92105726, 0xb1185e51},
+	{0x0109, 0x0055, 0xd0d000f9, 0x60a60bfd},
+	{0x00aa, 0x0022, 0x815d229d, 0x215d379c},
+	{0x09ac, 0x004f, 0x02f9d985, 0x10b90b20},
+	{0x0e1b, 0x00ce, 0x5cf92ab4, 0x6a477573},
+	{0x08af, 0x00d8, 0x17ca72d1, 0x385af156},
+	{0x0e33, 0x000a, 0xda2dba6b, 0xda2dbb69},
+	{0x0ee3, 0x006a, 0xb00048e5, 0xa9a2decc},
+	{0x0648, 0x001a, 0x2364b8cb, 0x3364b1cb},
+	{0x0315, 0x0085, 0x0596fd0d, 0xa651740f},
+	{0x0fbb, 0x003e, 0x298230ca, 0x7fc617c7},
+	{0x0422, 0x006a, 0x78ada4ab, 0xc576ae2a},
+	{0x04ba, 0x0073, 0xced1fbc2, 0xaac8455b},
+	{0x007d, 0x0061, 0x4b7ff236, 0x347d5739},
+	{0x070b, 0x00d0, 0x261cf0ae, 0xc7fb1c10},
+	{0x0c1a, 0x0035, 0x8be92ee2, 0x8be9b4e1},
+	{0x0af8, 0x0063, 0x824dcf03, 0x53010388},
+	{0x08f8, 0x006d, 0xd289710c, 0x30418edd},
+	{0x021b, 0x00ee, 0x6ac1c41d, 0x2557e9a3},
+	{0x05b5, 0x00da, 0x8e52f0e2, 0x98531012},
 };
 
 int __init
@@ -644,12 +648,19 @@ xfs_dahash_test(void)
 	unsigned int	errors = 0;
 
 	for (i = 0; i < ARRAY_SIZE(test); i++) {
+		struct xfs_name	xname = { };
 		xfs_dahash_t	hash;
 
 		hash = xfs_da_hashname(test_buf + test[i].start,
 				test[i].length);
 		if (hash != test[i].dahash)
 			errors++;
+
+		xname.name = test_buf + test[i].start;
+		xname.len = test[i].length;
+		hash = xfs_ascii_ci_hashname(&xname);
+		if (hash != test[i].ascii_ci_dahash)
+			errors++;
 	}
 
 	if (errors) {
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 8fb90da89787..7f071757f278 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -798,7 +798,6 @@ xfs_qm_dqget_cache_insert(
 	error = radix_tree_insert(tree, id, dqp);
 	if (unlikely(error)) {
 		/* Duplicate found!  Caller must try again. */
-		WARN_ON(error != -EEXIST);
 		mutex_unlock(&qi->qi_tree_lock);
 		trace_xfs_dqget_dup(dqp);
 		return error;
diff --git a/fs/xfs/xfs_drain.c b/fs/xfs/xfs_drain.c
new file mode 100644
index 000000000000..005a66be44a2
--- /dev/null
+++ b/fs/xfs/xfs_drain.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_ag.h"
+#include "xfs_trace.h"
+
+/*
+ * Use a static key here to reduce the overhead of xfs_drain_rele.  If the
+ * compiler supports jump labels, the static branch will be replaced by a nop
+ * sled when there are no xfs_drain_wait callers.  Online fsck is currently
+ * the only caller, so this is a reasonable tradeoff.
+ *
+ * Note: Patching the kernel code requires taking the cpu hotplug lock.  Other
+ * parts of the kernel allocate memory with that lock held, which means that
+ * XFS callers cannot hold any locks that might be used by memory reclaim or
+ * writeback when calling the static_branch_{inc,dec} functions.
+ */
+static DEFINE_STATIC_KEY_FALSE(xfs_drain_waiter_gate);
+
+void
+xfs_drain_wait_disable(void)
+{
+	static_branch_dec(&xfs_drain_waiter_gate);
+}
+
+void
+xfs_drain_wait_enable(void)
+{
+	static_branch_inc(&xfs_drain_waiter_gate);
+}
+
+void
+xfs_defer_drain_init(
+	struct xfs_defer_drain	*dr)
+{
+	atomic_set(&dr->dr_count, 0);
+	init_waitqueue_head(&dr->dr_waiters);
+}
+
+void
+xfs_defer_drain_free(struct xfs_defer_drain	*dr)
+{
+	ASSERT(atomic_read(&dr->dr_count) == 0);
+}
+
+/* Increase the pending intent count. */
+static inline void xfs_defer_drain_grab(struct xfs_defer_drain *dr)
+{
+	atomic_inc(&dr->dr_count);
+}
+
+static inline bool has_waiters(struct wait_queue_head *wq_head)
+{
+	/*
+	 * This memory barrier is paired with the one in set_current_state on
+	 * the waiting side.
+	 */
+	smp_mb__after_atomic();
+	return waitqueue_active(wq_head);
+}
+
+/* Decrease the pending intent count, and wake any waiters, if appropriate. */
+static inline void xfs_defer_drain_rele(struct xfs_defer_drain *dr)
+{
+	if (atomic_dec_and_test(&dr->dr_count) &&
+	    static_branch_unlikely(&xfs_drain_waiter_gate) &&
+	    has_waiters(&dr->dr_waiters))
+		wake_up(&dr->dr_waiters);
+}
+
+/* Are there intents pending? */
+static inline bool xfs_defer_drain_busy(struct xfs_defer_drain *dr)
+{
+	return atomic_read(&dr->dr_count) > 0;
+}
+
+/*
+ * Wait for the pending intent count for a drain to hit zero.
+ *
+ * Callers must not hold any locks that would prevent intents from being
+ * finished.
+ */
+static inline int xfs_defer_drain_wait(struct xfs_defer_drain *dr)
+{
+	return wait_event_killable(dr->dr_waiters, !xfs_defer_drain_busy(dr));
+}
+
+/*
+ * Get a passive reference to an AG and declare an intent to update its
+ * metadata.
+ */
+struct xfs_perag *
+xfs_perag_intent_get(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno)
+{
+	struct xfs_perag	*pag;
+
+	pag = xfs_perag_get(mp, agno);
+	if (!pag)
+		return NULL;
+
+	xfs_perag_intent_hold(pag);
+	return pag;
+}
+
+/*
+ * Release our intent to update this AG's metadata, and then release our
+ * passive ref to the AG.
+ */
+void
+xfs_perag_intent_put(
+	struct xfs_perag	*pag)
+{
+	xfs_perag_intent_rele(pag);
+	xfs_perag_put(pag);
+}
+
+/*
+ * Declare an intent to update AG metadata.  Other threads that need exclusive
+ * access can decide to back off if they see declared intentions.
+ */
+void
+xfs_perag_intent_hold(
+	struct xfs_perag	*pag)
+{
+	trace_xfs_perag_intent_hold(pag, __return_address);
+	xfs_defer_drain_grab(&pag->pag_intents_drain);
+}
+
+/* Release our intent to update this AG's metadata. */
+void
+xfs_perag_intent_rele(
+	struct xfs_perag	*pag)
+{
+	trace_xfs_perag_intent_rele(pag, __return_address);
+	xfs_defer_drain_rele(&pag->pag_intents_drain);
+}
+
+/*
+ * Wait for the intent update count for this AG to hit zero.
+ * Callers must not hold any AG header buffers.
+ */
+int
+xfs_perag_intent_drain(
+	struct xfs_perag	*pag)
+{
+	trace_xfs_perag_wait_intents(pag, __return_address);
+	return xfs_defer_drain_wait(&pag->pag_intents_drain);
+}
+
+/* Has anyone declared an intent to update this AG? */
+bool
+xfs_perag_intent_busy(
+	struct xfs_perag	*pag)
+{
+	return xfs_defer_drain_busy(&pag->pag_intents_drain);
+}
diff --git a/fs/xfs/xfs_drain.h b/fs/xfs/xfs_drain.h
new file mode 100644
index 000000000000..50a5772a8296
--- /dev/null
+++ b/fs/xfs/xfs_drain.h
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef XFS_DRAIN_H_
+#define XFS_DRAIN_H_
+
+struct xfs_perag;
+
+#ifdef CONFIG_XFS_DRAIN_INTENTS
+/*
+ * Passive drain mechanism.  This data structure tracks a count of some items
+ * and contains a waitqueue for callers who would like to wake up when the
+ * count hits zero.
+ */
+struct xfs_defer_drain {
+	/* Number of items pending in some part of the filesystem. */
+	atomic_t		dr_count;
+
+	/* Queue to wait for dri_count to go to zero */
+	struct wait_queue_head	dr_waiters;
+};
+
+void xfs_defer_drain_init(struct xfs_defer_drain *dr);
+void xfs_defer_drain_free(struct xfs_defer_drain *dr);
+
+void xfs_drain_wait_disable(void);
+void xfs_drain_wait_enable(void);
+
+/*
+ * Deferred Work Intent Drains
+ * ===========================
+ *
+ * When a writer thread executes a chain of log intent items, the AG header
+ * buffer locks will cycle during a transaction roll to get from one intent
+ * item to the next in a chain.  Although scrub takes all AG header buffer
+ * locks, this isn't sufficient to guard against scrub checking an AG while
+ * that writer thread is in the middle of finishing a chain because there's no
+ * higher level locking primitive guarding allocation groups.
+ *
+ * When there's a collision, cross-referencing between data structures (e.g.
+ * rmapbt and refcountbt) yields false corruption events; if repair is running,
+ * this results in incorrect repairs, which is catastrophic.
+ *
+ * The solution is to the perag structure the count of active intents and make
+ * scrub wait until it has both AG header buffer locks and the intent counter
+ * reaches zero.  It is therefore critical that deferred work threads hold the
+ * AGI or AGF buffers when decrementing the intent counter.
+ *
+ * Given a list of deferred work items, the deferred work manager will complete
+ * a work item and all the sub-items that the parent item creates before moving
+ * on to the next work item in the list.  This is also true for all levels of
+ * sub-items.  Writer threads are permitted to queue multiple work items
+ * targetting the same AG, so a deferred work item (such as a BUI) that creates
+ * sub-items (such as RUIs) must bump the intent counter and maintain it until
+ * the sub-items can themselves bump the intent counter.
+ *
+ * Therefore, the intent count tracks entire lifetimes of deferred work items.
+ * All functions that create work items must increment the intent counter as
+ * soon as the item is added to the transaction and cannot drop the counter
+ * until the item is finished or cancelled.
+ */
+struct xfs_perag *xfs_perag_intent_get(struct xfs_mount *mp,
+		xfs_agnumber_t agno);
+void xfs_perag_intent_put(struct xfs_perag *pag);
+
+void xfs_perag_intent_hold(struct xfs_perag *pag);
+void xfs_perag_intent_rele(struct xfs_perag *pag);
+
+int xfs_perag_intent_drain(struct xfs_perag *pag);
+bool xfs_perag_intent_busy(struct xfs_perag *pag);
+#else
+struct xfs_defer_drain { /* empty */ };
+
+#define xfs_defer_drain_free(dr)		((void)0)
+#define xfs_defer_drain_init(dr)		((void)0)
+
+#define xfs_perag_intent_get(mp, agno)		xfs_perag_get((mp), (agno))
+#define xfs_perag_intent_put(pag)		xfs_perag_put(pag)
+
+static inline void xfs_perag_intent_hold(struct xfs_perag *pag) { }
+static inline void xfs_perag_intent_rele(struct xfs_perag *pag) { }
+
+#endif /* CONFIG_XFS_DRAIN_INTENTS */
+
+#endif /* XFS_DRAIN_H_ */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 011b50469301..f9e36b810663 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -351,8 +351,6 @@ xfs_trans_free_extent(
 	struct xfs_mount		*mp = tp->t_mountp;
 	struct xfs_extent		*extp;
 	uint				next_extent;
-	xfs_agnumber_t			agno = XFS_FSB_TO_AGNO(mp,
-							xefi->xefi_startblock);
 	xfs_agblock_t			agbno = XFS_FSB_TO_AGBNO(mp,
 							xefi->xefi_startblock);
 	int				error;
@@ -363,12 +361,13 @@ xfs_trans_free_extent(
 	if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK)
 		oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
 
-	trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno,
-			xefi->xefi_blockcount);
+	trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0,
+			agbno, xefi->xefi_blockcount);
 
-	error = __xfs_free_extent(tp, xefi->xefi_startblock,
+	error = __xfs_free_extent(tp, xefi->xefi_pag, agbno,
 			xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE,
 			xefi->xefi_flags & XFS_EFI_SKIP_DISCARD);
+
 	/*
 	 * Mark the transaction dirty, even on error. This ensures the
 	 * transaction is aborted, which:
@@ -396,14 +395,13 @@ xfs_extent_free_diff_items(
 	const struct list_head		*a,
 	const struct list_head		*b)
 {
-	struct xfs_mount		*mp = priv;
 	struct xfs_extent_free_item	*ra;
 	struct xfs_extent_free_item	*rb;
 
 	ra = container_of(a, struct xfs_extent_free_item, xefi_list);
 	rb = container_of(b, struct xfs_extent_free_item, xefi_list);
-	return  XFS_FSB_TO_AGNO(mp, ra->xefi_startblock) -
-		XFS_FSB_TO_AGNO(mp, rb->xefi_startblock);
+
+	return ra->xefi_pag->pag_agno - rb->xefi_pag->pag_agno;
 }
 
 /* Log a free extent to the intent item. */
@@ -462,6 +460,26 @@ xfs_extent_free_create_done(
 	return &xfs_trans_get_efd(tp, EFI_ITEM(intent), count)->efd_item;
 }
 
+/* Take a passive ref to the AG containing the space we're freeing. */
+void
+xfs_extent_free_get_group(
+	struct xfs_mount		*mp,
+	struct xfs_extent_free_item	*xefi)
+{
+	xfs_agnumber_t			agno;
+
+	agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock);
+	xefi->xefi_pag = xfs_perag_intent_get(mp, agno);
+}
+
+/* Release a passive AG ref after some freeing work. */
+static inline void
+xfs_extent_free_put_group(
+	struct xfs_extent_free_item	*xefi)
+{
+	xfs_perag_intent_put(xefi->xefi_pag);
+}
+
 /* Process a free extent. */
 STATIC int
 xfs_extent_free_finish_item(
@@ -476,6 +494,8 @@ xfs_extent_free_finish_item(
 	xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
 
 	error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi);
+
+	xfs_extent_free_put_group(xefi);
 	kmem_cache_free(xfs_extfree_item_cache, xefi);
 	return error;
 }
@@ -496,6 +516,8 @@ xfs_extent_free_cancel_item(
 	struct xfs_extent_free_item	*xefi;
 
 	xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
+
+	xfs_extent_free_put_group(xefi);
 	kmem_cache_free(xfs_extfree_item_cache, xefi);
 }
 
@@ -526,24 +548,21 @@ xfs_agfl_free_finish_item(
 	struct xfs_extent		*extp;
 	struct xfs_buf			*agbp;
 	int				error;
-	xfs_agnumber_t			agno;
 	xfs_agblock_t			agbno;
 	uint				next_extent;
-	struct xfs_perag		*pag;
 
 	xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
 	ASSERT(xefi->xefi_blockcount == 1);
-	agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock);
 	agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock);
 	oinfo.oi_owner = xefi->xefi_owner;
 
-	trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, xefi->xefi_blockcount);
+	trace_xfs_agfl_free_deferred(mp, xefi->xefi_pag->pag_agno, 0, agbno,
+			xefi->xefi_blockcount);
 
-	pag = xfs_perag_get(mp, agno);
-	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
+	error = xfs_alloc_read_agf(xefi->xefi_pag, tp, 0, &agbp);
 	if (!error)
-		error = xfs_free_agfl_block(tp, agno, agbno, agbp, &oinfo);
-	xfs_perag_put(pag);
+		error = xfs_free_agfl_block(tp, xefi->xefi_pag->pag_agno,
+				agbno, agbp, &oinfo);
 
 	/*
 	 * Mark the transaction dirty, even on error. This ensures the
@@ -562,6 +581,7 @@ xfs_agfl_free_finish_item(
 	extp->ext_len = xefi->xefi_blockcount;
 	efdp->efd_next_extent++;
 
+	xfs_extent_free_put_group(xefi);
 	kmem_cache_free(xfs_extfree_item_cache, xefi);
 	return error;
 }
@@ -632,7 +652,9 @@ xfs_efi_item_recover(
 		fake.xefi_startblock = extp->ext_start;
 		fake.xefi_blockcount = extp->ext_len;
 
+		xfs_extent_free_get_group(mp, &fake);
 		error = xfs_trans_free_extent(tp, efdp, &fake);
+		xfs_extent_free_put_group(&fake);
 		if (error == -EFSCORRUPTED)
 			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
 					extp, sizeof(*extp));
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 863289aaa441..aede746541f8 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1389,25 +1389,10 @@ xfs_filemap_pfn_mkwrite(
 	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
 }
 
-static vm_fault_t
-xfs_filemap_map_pages(
-	struct vm_fault		*vmf,
-	pgoff_t			start_pgoff,
-	pgoff_t			end_pgoff)
-{
-	struct inode		*inode = file_inode(vmf->vma->vm_file);
-	vm_fault_t ret;
-
-	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	ret = filemap_map_pages(vmf, start_pgoff, end_pgoff);
-	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	return ret;
-}
-
 static const struct vm_operations_struct xfs_file_vm_ops = {
 	.fault		= xfs_filemap_fault,
 	.huge_fault	= xfs_filemap_huge_fault,
-	.map_pages	= xfs_filemap_map_pages,
+	.map_pages	= filemap_map_pages,
 	.page_mkwrite	= xfs_filemap_page_mkwrite,
 	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
 };
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index c9a7e270a428..351849fc18ff 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -767,7 +767,8 @@ again:
 	return 0;
 
 out_error_or_again:
-	if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) {
+	if (!(flags & (XFS_IGET_INCORE | XFS_IGET_NORETRY)) &&
+	    error == -EAGAIN) {
 		delay(1);
 		goto again;
 	}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 6cd180721659..87910191a9dd 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -34,10 +34,13 @@ struct xfs_icwalk {
 /*
  * Flags for xfs_iget()
  */
-#define XFS_IGET_CREATE		0x1
-#define XFS_IGET_UNTRUSTED	0x2
-#define XFS_IGET_DONTCACHE	0x4
-#define XFS_IGET_INCORE		0x8	/* don't read from disk or reinit */
+#define XFS_IGET_CREATE		(1U << 0)
+#define XFS_IGET_UNTRUSTED	(1U << 1)
+#define XFS_IGET_DONTCACHE	(1U << 2)
+/* don't read from disk or reinit */
+#define XFS_IGET_INCORE		(1U << 3)
+/* Return -EAGAIN immediately if the inode is unavailable. */
+#define XFS_IGET_NORETRY	(1U << 4)
 
 int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
 	     uint flags, uint lock_flags, xfs_inode_t **ipp);
diff --git a/fs/xfs/xfs_iunlink_item.c b/fs/xfs/xfs_iunlink_item.c
index 43005ce8bd48..2ddccb172fa0 100644
--- a/fs/xfs/xfs_iunlink_item.c
+++ b/fs/xfs/xfs_iunlink_item.c
@@ -168,9 +168,7 @@ xfs_iunlink_log_inode(
 	iup->ip = ip;
 	iup->next_agino = next_agino;
 	iup->old_agino = ip->i_next_unlinked;
-
-	atomic_inc(&pag->pag_ref);
-	iup->pag = pag;
+	iup->pag = xfs_perag_hold(pag);
 
 	xfs_trans_add_item(tp, &iup->item);
 	tp->t_flags |= XFS_TRANS_DIRTY;
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index 21be93bf006d..b3275e8d47b6 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -667,11 +667,10 @@ xfs_iwalk_threaded(
 		iwag->mp = mp;
 
 		/*
-		 * perag is being handed off to async work, so take another
+		 * perag is being handed off to async work, so take a passive
 		 * reference for the async work to release.
 		 */
-		atomic_inc(&pag->pag_ref);
-		iwag->pag = pag;
+		iwag->pag = xfs_perag_hold(pag);
 		iwag->iwalk_fn = iwalk_fn;
 		iwag->data = data;
 		iwag->startino = startino;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index e88f18f85e4b..74dcb05069e8 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -80,6 +80,7 @@ typedef __u32			xfs_nlink_t;
 #include "xfs_cksum.h"
 #include "xfs_buf.h"
 #include "xfs_message.h"
+#include "xfs_drain.h"
 
 #ifdef __BIG_ENDIAN
 #define XFS_NATIVE_HOST 1
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 48d771a76add..edd8587658d5 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -20,6 +20,7 @@
 #include "xfs_error.h"
 #include "xfs_log_priv.h"
 #include "xfs_log_recover.h"
+#include "xfs_ag.h"
 
 struct kmem_cache	*xfs_cui_cache;
 struct kmem_cache	*xfs_cud_cache;
@@ -279,14 +280,13 @@ xfs_refcount_update_diff_items(
 	const struct list_head		*a,
 	const struct list_head		*b)
 {
-	struct xfs_mount		*mp = priv;
 	struct xfs_refcount_intent	*ra;
 	struct xfs_refcount_intent	*rb;
 
 	ra = container_of(a, struct xfs_refcount_intent, ri_list);
 	rb = container_of(b, struct xfs_refcount_intent, ri_list);
-	return  XFS_FSB_TO_AGNO(mp, ra->ri_startblock) -
-		XFS_FSB_TO_AGNO(mp, rb->ri_startblock);
+
+	return ra->ri_pag->pag_agno - rb->ri_pag->pag_agno;
 }
 
 /* Set the phys extent flags for this reverse mapping. */
@@ -365,6 +365,26 @@ xfs_refcount_update_create_done(
 	return &xfs_trans_get_cud(tp, CUI_ITEM(intent))->cud_item;
 }
 
+/* Take a passive ref to the AG containing the space we're refcounting. */
+void
+xfs_refcount_update_get_group(
+	struct xfs_mount		*mp,
+	struct xfs_refcount_intent	*ri)
+{
+	xfs_agnumber_t			agno;
+
+	agno = XFS_FSB_TO_AGNO(mp, ri->ri_startblock);
+	ri->ri_pag = xfs_perag_intent_get(mp, agno);
+}
+
+/* Release a passive AG ref after finishing refcounting work. */
+static inline void
+xfs_refcount_update_put_group(
+	struct xfs_refcount_intent	*ri)
+{
+	xfs_perag_intent_put(ri->ri_pag);
+}
+
 /* Process a deferred refcount update. */
 STATIC int
 xfs_refcount_update_finish_item(
@@ -386,6 +406,8 @@ xfs_refcount_update_finish_item(
 		       ri->ri_type == XFS_REFCOUNT_DECREASE);
 		return -EAGAIN;
 	}
+
+	xfs_refcount_update_put_group(ri);
 	kmem_cache_free(xfs_refcount_intent_cache, ri);
 	return error;
 }
@@ -406,6 +428,8 @@ xfs_refcount_update_cancel_item(
 	struct xfs_refcount_intent	*ri;
 
 	ri = container_of(item, struct xfs_refcount_intent, ri_list);
+
+	xfs_refcount_update_put_group(ri);
 	kmem_cache_free(xfs_refcount_intent_cache, ri);
 }
 
@@ -520,9 +544,13 @@ xfs_cui_item_recover(
 
 		fake.ri_startblock = pmap->pe_startblock;
 		fake.ri_blockcount = pmap->pe_len;
-		if (!requeue_only)
+
+		if (!requeue_only) {
+			xfs_refcount_update_get_group(mp, &fake);
 			error = xfs_trans_log_finish_refcount_update(tp, cudp,
 					&fake, &rcur);
+			xfs_refcount_update_put_group(&fake);
+		}
 		if (error == -EFSCORRUPTED)
 			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
 					&cuip->cui_format,
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index a1619d67015f..520c7ebdfed8 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -20,6 +20,7 @@
 #include "xfs_error.h"
 #include "xfs_log_priv.h"
 #include "xfs_log_recover.h"
+#include "xfs_ag.h"
 
 struct kmem_cache	*xfs_rui_cache;
 struct kmem_cache	*xfs_rud_cache;
@@ -320,14 +321,13 @@ xfs_rmap_update_diff_items(
 	const struct list_head		*a,
 	const struct list_head		*b)
 {
-	struct xfs_mount		*mp = priv;
 	struct xfs_rmap_intent		*ra;
 	struct xfs_rmap_intent		*rb;
 
 	ra = container_of(a, struct xfs_rmap_intent, ri_list);
 	rb = container_of(b, struct xfs_rmap_intent, ri_list);
-	return  XFS_FSB_TO_AGNO(mp, ra->ri_bmap.br_startblock) -
-		XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock);
+
+	return ra->ri_pag->pag_agno - rb->ri_pag->pag_agno;
 }
 
 /* Log rmap updates in the intent item. */
@@ -390,6 +390,26 @@ xfs_rmap_update_create_done(
 	return &xfs_trans_get_rud(tp, RUI_ITEM(intent))->rud_item;
 }
 
+/* Take a passive ref to the AG containing the space we're rmapping. */
+void
+xfs_rmap_update_get_group(
+	struct xfs_mount	*mp,
+	struct xfs_rmap_intent	*ri)
+{
+	xfs_agnumber_t		agno;
+
+	agno = XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock);
+	ri->ri_pag = xfs_perag_intent_get(mp, agno);
+}
+
+/* Release a passive AG ref after finishing rmapping work. */
+static inline void
+xfs_rmap_update_put_group(
+	struct xfs_rmap_intent	*ri)
+{
+	xfs_perag_intent_put(ri->ri_pag);
+}
+
 /* Process a deferred rmap update. */
 STATIC int
 xfs_rmap_update_finish_item(
@@ -405,6 +425,8 @@ xfs_rmap_update_finish_item(
 
 	error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), ri,
 			state);
+
+	xfs_rmap_update_put_group(ri);
 	kmem_cache_free(xfs_rmap_intent_cache, ri);
 	return error;
 }
@@ -425,6 +447,8 @@ xfs_rmap_update_cancel_item(
 	struct xfs_rmap_intent		*ri;
 
 	ri = container_of(item, struct xfs_rmap_intent, ri_list);
+
+	xfs_rmap_update_put_group(ri);
 	kmem_cache_free(xfs_rmap_intent_cache, ri);
 }
 
@@ -559,11 +583,13 @@ xfs_rui_item_recover(
 		fake.ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ?
 				XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
 
+		xfs_rmap_update_get_group(mp, &fake);
 		error = xfs_trans_log_finish_rmap_update(tp, rudp, &fake,
 				&rcur);
 		if (error == -EFSCORRUPTED)
 			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
 					map, sizeof(*map));
+		xfs_rmap_update_put_group(&fake);
 		if (error)
 			goto abort_error;
 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 4f814f9e12ab..4d2e87462ac4 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1548,6 +1548,19 @@ xfs_fs_fill_super(
 #endif
 	}
 
+	/* ASCII case insensitivity is undergoing deprecation. */
+	if (xfs_has_asciici(mp)) {
+#ifdef CONFIG_XFS_SUPPORT_ASCII_CI
+		xfs_warn_once(mp,
+	"Deprecated ASCII case-insensitivity feature (ascii-ci=1) will not be supported after September 2030.");
+#else
+		xfs_warn(mp,
+	"Deprecated ASCII case-insensitivity feature (ascii-ci=1) not supported by kernel.");
+		error = -EINVAL;
+		goto out_free_sb;
+#endif
+	}
+
 	/* Filesystem claims it needs repair, so refuse the mount. */
 	if (xfs_has_needsrepair(mp)) {
 		xfs_warn(mp, "Filesystem needs repair.  Please run xfs_repair.");
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 546a6cd96729..fade33735393 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -210,28 +210,10 @@ static struct ctl_table xfs_table[] = {
 	{}
 };
 
-static struct ctl_table xfs_dir_table[] = {
-	{
-		.procname	= "xfs",
-		.mode		= 0555,
-		.child		= xfs_table
-	},
-	{}
-};
-
-static struct ctl_table xfs_root_table[] = {
-	{
-		.procname	= "fs",
-		.mode		= 0555,
-		.child		= xfs_dir_table
-	},
-	{}
-};
-
 int
 xfs_sysctl_register(void)
 {
-	xfs_table_header = register_sysctl_table(xfs_root_table);
+	xfs_table_header = register_sysctl("fs/xfs", xfs_table);
 	if (!xfs_table_header)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 9c0006c55fec..cd4ca5b1fcb0 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -190,6 +190,7 @@ DEFINE_EVENT(xfs_perag_class, name,	\
 	TP_ARGS(pag, caller_ip))
 DEFINE_PERAG_REF_EVENT(xfs_perag_get);
 DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
+DEFINE_PERAG_REF_EVENT(xfs_perag_hold);
 DEFINE_PERAG_REF_EVENT(xfs_perag_put);
 DEFINE_PERAG_REF_EVENT(xfs_perag_grab);
 DEFINE_PERAG_REF_EVENT(xfs_perag_grab_tag);
@@ -2686,6 +2687,44 @@ DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_deferred);
 DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_defer);
 DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_deferred);
 
+DECLARE_EVENT_CLASS(xfs_defer_pending_item_class,
+	TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp,
+		 void *item),
+	TP_ARGS(mp, dfp, item),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(int, type)
+		__field(void *, intent)
+		__field(void *, item)
+		__field(char, committed)
+		__field(int, nr)
+	),
+	TP_fast_assign(
+		__entry->dev = mp ? mp->m_super->s_dev : 0;
+		__entry->type = dfp->dfp_type;
+		__entry->intent = dfp->dfp_intent;
+		__entry->item = item;
+		__entry->committed = dfp->dfp_done != NULL;
+		__entry->nr = dfp->dfp_count;
+	),
+	TP_printk("dev %d:%d optype %d intent %p item %p committed %d nr %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->type,
+		  __entry->intent,
+		  __entry->item,
+		  __entry->committed,
+		  __entry->nr)
+)
+#define DEFINE_DEFER_PENDING_ITEM_EVENT(name) \
+DEFINE_EVENT(xfs_defer_pending_item_class, name, \
+	TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp, \
+		 void *item), \
+	TP_ARGS(mp, dfp, item))
+
+DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_add_item);
+DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_cancel_item);
+DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_finish_item);
+
 /* rmap tracepoints */
 DECLARE_EVENT_CLASS(xfs_rmap_class,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
@@ -4325,6 +4364,39 @@ TRACE_EVENT(xfs_force_shutdown,
 		__entry->line_num)
 );
 
+#ifdef CONFIG_XFS_DRAIN_INTENTS
+DECLARE_EVENT_CLASS(xfs_perag_intents_class,
+	TP_PROTO(struct xfs_perag *pag, void *caller_ip),
+	TP_ARGS(pag, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(long, nr_intents)
+		__field(void *, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = pag->pag_mount->m_super->s_dev;
+		__entry->agno = pag->pag_agno;
+		__entry->nr_intents = atomic_read(&pag->pag_intents_drain.dr_count);
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d agno 0x%x intents %ld caller %pS",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->nr_intents,
+		  __entry->caller_ip)
+);
+
+#define DEFINE_PERAG_INTENTS_EVENT(name)	\
+DEFINE_EVENT(xfs_perag_intents_class, name,					\
+	TP_PROTO(struct xfs_perag *pag, void *caller_ip), \
+	TP_ARGS(pag, caller_ip))
+DEFINE_PERAG_INTENTS_EVENT(xfs_perag_intent_hold);
+DEFINE_PERAG_INTENTS_EVENT(xfs_perag_intent_rele);
+DEFINE_PERAG_INTENTS_EVENT(xfs_perag_wait_intents);
+
+#endif /* CONFIG_XFS_DRAIN_INTENTS */
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH